Coverage for python/lsst/ctrl/bps/wms/pegasus/pegasus_service.py : 3%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Support for using Pegasus WMS.
23"""
__all__ = ["PegasusService", "PegasusWorkflow"]


import copy
import logging
import os
import re
import shlex
import shutil
import subprocess

from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog
from Pegasus.DAX3 import ADAG, PFN, Executable, File, Job, Link, Namespace, Profile

from ... import BaseWmsService, BaseWmsWorkflow
from ...bps_utils import chdir
from ..htcondor import HTCondorService, htc_write_attribs


_LOG = logging.getLogger(__name__)
class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        # Record which plugin produced the submission.
        service_class = f"{type(self).__module__}.{type(self).__name__}"
        workflow = PegasusWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class)

        # Materialize catalogs/DAX on disk, then plan into an HTCondor DAG.
        workflow.write(out_prefix)
        workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return workflow
75 def submit(self, workflow):
76 """Submit a single WMS workflow
78 Parameters
79 ----------
80 workflow : `lsst.ctrl.bps.BaseWorkflow`
81 A single HTCondor workflow to submit
82 """
83 with chdir(workflow.submit_path):
84 _LOG.info("Submitting from directory: %s", os.getcwd())
85 command = f"pegasus-run {workflow.run_id}"
86 with open(f"{workflow.name}_pegasus-run.out", "w") as outfh:
87 process = subprocess.Popen(shlex.split(command), shell=False, stdout=outfh,
88 stderr=subprocess.STDOUT)
89 process.wait()
91 if process.returncode != 0:
92 raise RuntimeError("pegasus-run exited with non-zero exit code (%s)" % process.returncode)
94 # Note:
95 #
96 # No need to save run id as the same as the run id generated when
97 # running pegasus-plan earlier.
99 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
100 """Query WMS for list of submitted WMS workflows/jobs.
102 This should be a quick lookup function to create list of jobs for
103 other functions.
105 Parameters
106 ----------
107 wms_id : `int` or `str`, optional
108 Id or path that can be used by WMS service to look up job.
109 user : `str`, optional
110 User whose submitted jobs should be listed.
111 require_bps : `bool`, optional
112 Whether to require jobs returned in list to be bps-submitted jobs.
113 pass_thru : `str`, optional
114 Information to pass through to WMS.
116 Returns
117 -------
118 job_ids : `list` [`Any`]
119 Only job ids to be used by cancel and other functions. Typically
120 this means top-level jobs (i.e., not children jobs).
121 """
122 htc_service = HTCondorService(self.config)
123 return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru)
125 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
126 """Query WMS for status of submitted WMS workflows
127 Parameters
128 ----------
129 wms_workflow_id : `int` or `str`, optional
130 Id that can be used by WMS service to look up status.
131 user : `str`, optional
132 Limit report to submissions by this particular user
133 hist : `int`, optional
134 Number of days to expand report to include finished WMS workflows.
135 pass_thru : `str`, optional
136 Additional arguments to pass through to the specific WMS service.
138 Returns
139 -------
140 run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`]
141 Status information for submitted WMS workflows
142 message : `str`
143 Message to user on how to find more status information specific to
144 WMS.
145 """
146 htc_service = HTCondorService(self.config)
147 return htc_service.report(wms_workflow_id, user, hist, pass_thru)
149 def cancel(self, wms_id, pass_thru=None):
150 """Cancel submitted workflows/jobs.
152 Parameters
153 ----------
154 wms_id : `str`
155 ID or path of job that should be canceled.
156 pass_thru : `str`, optional
157 Information to pass through to WMS.
159 Returns
160 --------
161 deleted : `bool`
162 Whether successful deletion or not. Currently, if any doubt or any
163 individual jobs not deleted, return False.
164 message : `str`
165 Any message from WMS (e.g., error details).
166 """
167 _LOG.debug("Canceling wms_id = %s", wms_id)
169 # if wms_id is a numeric HTCondor id, use HTCondor plugin to delete
170 try:
171 float(wms_id)
172 htc_service = HTCondorService(self.config)
173 deleted, message = htc_service.cancel(wms_id, pass_thru)
174 except ValueError:
175 command = f"pegasus-remove {wms_id}"
176 _LOG.debug(command)
177 completed_process = subprocess.run(shlex.split(command), shell=False, check=False,
178 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
179 _LOG.debug(completed_process.stdout)
180 _LOG.debug("Return code = %s", completed_process.returncode)
182 if completed_process.returncode != 0:
183 deleted = False
184 m = re.match(b"443", completed_process.stdout)
185 if m:
186 message = "no such bps job in batch queue"
187 else:
188 message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}"
189 print("XXX", completed_process.stdout.decode(), "XXX")
190 print(message)
191 else:
192 deleted = True
194 return deleted, message
class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow.

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """
    def __init__(self, name, config):
        # Base class stores config, run_id, submit_path.
        super().__init__(name, config)

        self.dax = ADAG(name)
        self.run_attrs = None

        # Catalogs are filled in by _init_catalogs(); the sites catalog is
        # created later, at write() time (it needs the output directory).
        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()

        # Filenames are assigned when write() runs.
        self.properties_filename = None
        self.dax_filename = None
221 def _init_catalogs(self):
222 # Set workdir in catalogs at write time. So pass None as value here.
224 # Replica Catalog keeps mappings of logical file ids/names (LFN's) to
225 # physical file ids/names (PFN's)
226 if "rcFile" not in self.config:
227 fname = "rc.txt"
228 self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname)
230 # Transformation Catalog describes all of the executables
231 # (called "transformations") used by the workflow.
232 if "tcFile" not in self.config:
233 fname = "tc.txt"
234 self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname)
236 # Note:
237 #
238 # SitesCatalog needs workdir at initialization to create local site
239 # for submit side directory where the output data from the workflow
240 # will be stored. So delaying creation of SitesCatalog until all the
241 # write function is called with a given output directory.
243 @classmethod
244 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
245 # Docstring inherited.
246 peg_workflow = cls(generic_workflow.name, config)
247 peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs)
248 peg_workflow.run_attrs["bps_wms_service"] = service_class
249 peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}"
251 # Create initial Pegasus File objects for all files that WMS must
252 # handle.
253 peg_files = {}
254 for gwf_file in generic_workflow.get_files(data=True, transfer_only=True):
255 if gwf_file.wms_transfer:
256 peg_file = File(gwf_file.name)
257 peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local"))
258 peg_files[gwf_file.name] = peg_file
260 # Add jobs to the DAX.
261 for job_name in generic_workflow:
262 gwf_job = generic_workflow.get_job(job_name)
263 job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files)
264 peg_workflow.dax.addJob(job)
266 # Add job dependencies to the DAX.
267 for job_name in generic_workflow:
268 for child_name in generic_workflow.successors(job_name):
269 peg_workflow.dax.depends(parent=peg_workflow.dax.getJob(job_name),
270 child=peg_workflow.dax.getJob(child_name))
272 return peg_workflow
274 def create_job(self, generic_workflow, gwf_job, peg_files):
275 """Create a Pegasus job corresponding to the given GenericWorkflow job.
277 Parameters
278 ----------
279 generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
280 Generic workflow that is being converted.
281 gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
282 The generic job to convert to a Pegasus job.
283 peg_files : `dict` [`str`, `Pegasus.DAX3.File`]
284 Pegasus Files needed when creating Pegasus Job.
286 Returns
287 -------
288 job : `Pegasus.DAX3.Job`
289 Pegasus job created from the generic workflow job.
291 Notes
292 -----
293 https://pegasus.isi.edu/documentation/reference-guide/variable
294 -expansion.html Says that ${VAR} gets expanded with submit side
295 values during pegasus-plan. If try $VAR (which isn't supposed to get
296 expanded by pegasus-plan), the environment variable (e.g.,
297 ${CTRL_MPEXEC_DIR} gets completely dropped from the executable path
298 and job dies because cannot find executable (/bin/pipetask).
300 So, currently Pegasus plugin only works if environment variables used
301 in commands are same on submit machine and compute machine.
302 """
303 _LOG.debug("GenericWorkflowJob=%s", gwf_job)
305 # Save transformation.
306 executable = Executable(gwf_job.executable.name,
307 installed=not gwf_job.executable.transfer_executable)
308 newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", gwf_job.executable.src_uri)
309 _LOG.debug("Executable after replacing any environment variables = %s", newexec)
310 executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site))
311 self.transformation_catalog.add(executable)
313 # Create Pegasus Job.
314 job = Job(gwf_job.executable.name, id=gwf_job.name, node_label=gwf_job.label)
316 if gwf_job.arguments:
317 arguments = gwf_job.arguments
318 # Replace command variables
319 arguments = arguments.format(**gwf_job.cmdvals)
321 # Replace env vars
322 arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments)
323 _LOG.debug("Arguments after replacing any environment variables = %s", arguments)
325 # Replace file placeholders
326 arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments)
327 _LOG.debug("Command line arguments: %s", arguments)
329 # Break up command string into separate args for Pegasus Job object
330 # replacing file names with Pegasus File objects
331 args = arguments.split()
332 logical_file_names = list(set(peg_files) & set(args))
333 if logical_file_names:
334 indices = [args.index(lfn) for lfn in logical_file_names]
335 for idx, lfn in zip(indices, logical_file_names):
336 args[idx] = peg_files[lfn]
338 job.addArguments(*args)
339 else:
340 _LOG.warning("Job %s does not have any arguments", gwf_job.name)
342 if gwf_job.request_memory: # MB
343 job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory))
344 if gwf_job.request_cpus: # cores
345 job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus))
346 if gwf_job.request_disk: # MB
347 job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk))
348 if gwf_job.priority: # MB
349 job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority))
351 # Add extra job attributes
352 for key, value in gwf_job.profile.items():
353 job.addProfile(Profile(Namespace.CONDOR, key, value))
355 for key, value in gwf_job.environment.items():
356 job.addProfile(Profile(Namespace.ENV, key, value))
358 # Add run attributes
359 for key, value in self.run_attrs.items():
360 job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))
362 for key, value in gwf_job.attrs.items():
363 _LOG.debug("create_job: attrs = %s", gwf_job.attrs)
364 job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))
366 job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"'))
367 job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"'))
368 if "quanta_summary" in gwf_job.tags:
369 job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_quanta",
370 value=f"\"{gwf_job.tags['quanta_summary']}\""))
372 # Specify job's inputs.
373 for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True):
374 peg_file = peg_files[gwf_file.name]
375 job.uses(peg_file, link=Link.INPUT)
376 for pfn in peg_file.pfns:
377 self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)
379 # Specify job's outputs
380 for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True):
381 peg_file = peg_files[gwf_file.name]
382 job.uses(peg_file, link=Link.OUTPUT)
383 for pfn in peg_file.pfns:
384 self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)
386 return job
388 def _define_sites(self, out_prefix):
389 """Create Pegasus Site Catalog
391 Parameters
392 ----------
393 out_prefix : `str`
394 Directory prefix for the site catalog file.
396 Notes
397 -----
398 SitesCatalog needs workdir at initialization to create local site for
399 submit side directory where the output data from the workflow will be
400 stored.
401 """
402 self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml")
404 # Adding information for all sites defined in config instead of
405 # limiting to those actually used by the workflow
406 for site, site_data in self.config["site"].items():
407 self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"])
408 if "directory" in site_data:
409 # Workaround because no Python API
410 dir_dict = {}
411 for site_dir in site_data["directory"]:
412 dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]}
413 self.sites_catalog._sites[site]["directories"] = dir_dict
415 # add config provided site attributes
416 if "profile" in site_data:
417 for pname, pdata in site_data["profile"].items():
418 for key, val in pdata.items():
419 self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val)
420 self.sites_catalog.add_site_profile(site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE",
421 value=f"{self.name}.node_status")
423 def write(self, out_prefix):
424 """Write Pegasus Catalogs and DAX to files.
426 Parameters
427 ----------
428 out_prefix : `str`
429 Directory prefix for all the Pegasus workflow files.
430 """
431 self.submit_path = out_prefix
433 # filenames needed for properties file
434 filenames = {}
436 # Write down the workflow in DAX format.
437 self.dax_filename = f"{self.dax.name}.dax"
438 if out_prefix is not None:
439 os.makedirs(out_prefix, exist_ok=True)
440 self.dax_filename = os.path.join(out_prefix, self.dax_filename)
441 with open(self.dax_filename, "w") as outfh:
442 self.dax.writeXML(outfh)
444 # output site catalog
445 filename = f"{self.name}_sites.xml"
446 if "scFile" not in self.config:
447 self._define_sites(out_prefix)
448 self.sites_catalog.workflow_dir = out_prefix
449 self.sites_catalog.filename = filename
450 self.sites_catalog.write()
451 else:
452 shutil.copy(self.config["sitesFile"], os.path.join(self.submit_path, filename))
453 filenames["sites"] = filename
455 # output transformation catalog
456 filename = f"{self.name}_tc.txt"
457 if self.transformation_catalog is not None:
458 self.transformation_catalog.workflow_dir = out_prefix
459 self.transformation_catalog.filename = filename
460 self.transformation_catalog.write()
461 else:
462 shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename))
463 filenames["transformation"] = filename
465 # output replica catalog
466 filename = f"{self.name}_rc.txt"
467 if self.replica_catalog is not None:
468 self.replica_catalog.workflow_dir = out_prefix
469 self.replica_catalog.filename = filename
470 self.replica_catalog.write()
471 else:
472 shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename))
473 filenames["replica"] = filename
475 self.properties_filename = self._write_properties_file(out_prefix, filenames)
477 def run_pegasus_plan(self, out_prefix, run_attr):
478 """Execute pegasus-plan to convert DAX to HTCondor DAG for submission.
480 Parameters
481 ----------
482 out_prefix : `str`
483 Root directory in which to output all files.
484 run_attr : `dict`
485 Attributes to add to main DAG.
486 """
487 cmd = f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir " \
488 f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} " \
489 f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output"
490 _LOG.debug("Plan command: %s", cmd)
491 pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out"
492 with chdir(self.submit_path):
493 _LOG.debug("pegasus-plan in directory: %s", os.getcwd())
494 _LOG.debug("pegasus-plan output in %s", pegout)
495 with open(pegout, "w") as pegfh:
496 print(f"Command: {cmd}\n", file=pegfh) # Note: want blank line
497 process = subprocess.run(shlex.split(cmd), shell=False, stdout=pegfh,
498 stderr=subprocess.STDOUT, check=False)
499 if process.returncode != 0:
500 print(f"Error trying to generate Pegasus files. See {pegout}.")
501 raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})")
503 # Grab run id from pegasus-plan output and save
504 with open(pegout, "r") as pegfh:
505 for line in pegfh:
506 match = re.search(r"pegasus-run\s+(\S+)", line)
507 if match:
508 self.run_id = match.group(1)
509 break
511 # Hack - Using profile in sites.xml doesn't add run attributes to DAG
512 # submission file. So adding them here:
513 if run_attr is not None:
514 subname = f"{self.run_id}/{self.name}-0.dag.condor.sub"
515 shutil.copyfile(subname, subname + ".orig")
516 with open(subname + ".orig", "r") as infh:
517 with open(subname, "w") as outfh:
518 for line in infh:
519 line = line.strip()
520 if line == "queue":
521 htc_write_attribs(outfh, run_attr)
522 htc_write_attribs(outfh, {"bps_job_label": "DAG"})
523 print(line, file=outfh)
525 def _write_properties_file(self, out_prefix, filenames):
526 """Write Pegasus Properties File.
528 Parameters
529 ----------
530 out_prefix : `str`
531 Directory prefix for properties file.
532 filenames : `dict` [`str`, `str`]
533 Mapping of Pegasus file keys to filenames.
535 Returns
536 -------
537 properties : `str`
538 Filename of the pegasus properties file.
539 """
540 properties = f"{self.name}_pegasus.properties"
541 if out_prefix is not None:
542 properties = os.path.join(out_prefix, properties)
543 with open(properties, "w") as outfh:
544 print("# This tells Pegasus where to find the Site Catalog.", file=outfh)
545 print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh)
547 print("# This tells Pegasus where to find the Replica Catalog.", file=outfh)
548 print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh)
550 print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh)
551 print("pegasus.catalog.transformation=Text", file=outfh)
552 print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh)
554 print("# Run Pegasus in shared file system mode.", file=outfh)
555 print("pegasus.data.configuration=sharedfs", file=outfh)
557 print("# Make Pegasus use links instead of transferring files.", file=outfh)
558 print("pegasus.transfer.*.impl=Transfer", file=outfh)
559 print("pegasus.transfer.links=true", file=outfh)
561 return properties