Coverage for python/lsst/ctrl/bps/wms/pegasus/pegasus_service.py : 3%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22"""Support for using Pegasus WMS.
23"""
25__all__ = ["PegasusService", "PegasusWorkflow"]
28import os
29import copy
30import re
31import subprocess
32import shlex
33import shutil
34import logging
36from Pegasus.DAX3 import ADAG, File, Job, Link, PFN, Executable, Profile, Namespace
37from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog
39from ... import BaseWmsService, BaseWmsWorkflow
40from ...bps_utils import chdir
41from ..htcondor import HTCondorService, htc_write_attribs
# Module-level logger for this Pegasus WMS plugin.
_LOG = logging.getLogger(__name__)
class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        peg_workflow = PegasusWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                             service_class)
        peg_workflow.write(out_prefix)
        peg_workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return peg_workflow

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single Pegasus workflow to submit.
        """
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            command = f"pegasus-run {workflow.run_id}"
            with open(f"{workflow.name}_pegasus-run.out", "w") as outfh:
                process = subprocess.Popen(shlex.split(command), shell=False, stdout=outfh,
                                           stderr=subprocess.STDOUT)
                process.wait()

        if process.returncode != 0:
            raise RuntimeError(f"pegasus-run exited with non-zero exit code ({process.returncode})")

        # Note:
        #
        # No need to save run id as it is the same as the run id generated
        # when running pegasus-plan earlier.

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=True):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to True.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        # Pegasus submits via HTCondor, so delegate the queue lookup.
        htc_service = HTCondorService(self.config)
        return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru, is_global)

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=True):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set, all job queues (and their histories) will be queried for
            job information. Defaults to True.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            WMS.
        """
        # Pegasus submits via HTCondor, so delegate the status lookup.
        htc_service = HTCondorService(self.config)
        return htc_service.report(wms_workflow_id, user, hist, pass_thru, is_global)

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        # If wms_id is a numeric HTCondor id, use the HTCondor plugin to
        # delete the job; otherwise treat it as a Pegasus run directory.
        try:
            float(wms_id)
            htc_service = HTCondorService(self.config)
            deleted, message = htc_service.cancel(wms_id, pass_thru)
        except ValueError:
            command = f"pegasus-remove {wms_id}"
            _LOG.debug(command)
            completed_process = subprocess.run(shlex.split(command), shell=False, check=False,
                                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            _LOG.debug(completed_process.stdout)
            _LOG.debug("Return code = %s", completed_process.returncode)

            if completed_process.returncode != 0:
                deleted = False
                # NOTE(review): re.match only tests at the start of the
                # output; presumably "443" is the pegasus-remove error code
                # for a missing job -- confirm against pegasus-remove docs.
                m = re.match(b"443", completed_process.stdout)
                if m:
                    message = "no such bps job in batch queue"
                else:
                    message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}"
                _LOG.debug("pegasus-remove output: %s", completed_process.stdout.decode())
            else:
                deleted = True
                # Bug fix: message was previously unassigned on the success
                # path, causing a NameError at the return below.
                message = ""

        return deleted, message
class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow.

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config):
        # config, run_id, submit_path are handled by the base class.
        super().__init__(name, config)
        self.dax = ADAG(name)
        self.run_attrs = None

        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()
        self.properties_filename = None
        self.dax_filename = None

    def _init_catalogs(self):
        """Create replica and transformation catalogs unless the user
        supplied pre-made catalog files in the config.
        """
        # Set workdir in catalogs at write time. So pass None as value here.

        # Replica Catalog keeps mappings of logical file ids/names (LFN's) to
        # physical file ids/names (PFN's).
        if "rcFile" not in self.config:
            fname = "rc.txt"
            self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname)

        # Transformation Catalog describes all of the executables
        # (called "transformations") used by the workflow.
        if "tcFile" not in self.config:
            fname = "tc.txt"
            self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname)

        # Note:
        #
        # SitesCatalog needs workdir at initialization to create local site
        # for submit side directory where the output data from the workflow
        # will be stored. So delaying creation of SitesCatalog until the
        # write function is called with a given output directory.

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited.
        peg_workflow = cls(generic_workflow.name, config)
        peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs)
        peg_workflow.run_attrs["bps_wms_service"] = service_class
        peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}"

        # Create initial Pegasus File objects for all files that WMS must
        # handle.
        peg_files = {}
        for gwf_file in generic_workflow.get_files(data=True, transfer_only=True):
            if gwf_file.wms_transfer:
                peg_file = File(gwf_file.name)
                peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local"))
                peg_files[gwf_file.name] = peg_file

        # Add jobs to the DAX.
        for job_name in generic_workflow:
            gwf_job = generic_workflow.get_job(job_name)
            job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files)
            peg_workflow.dax.addJob(job)

        # Add job dependencies to the DAX.
        for job_name in generic_workflow:
            for child_name in generic_workflow.successors(job_name):
                peg_workflow.dax.depends(parent=peg_workflow.dax.getJob(job_name),
                                         child=peg_workflow.dax.getJob(child_name))

        return peg_workflow

    def create_job(self, generic_workflow, gwf_job, peg_files):
        """Create a Pegasus job corresponding to the given GenericWorkflow job.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a Pegasus job.
        peg_files : `dict` [`str`, `Pegasus.DAX3.File`]
            Pegasus Files needed when creating Pegasus Job.

        Returns
        -------
        job : `Pegasus.DAX3.Job`
            Pegasus job created from the generic workflow job.

        Notes
        -----
        https://pegasus.isi.edu/documentation/reference-guide/variable
        -expansion.html Says that ${VAR} gets expanded with submit side
        values during pegasus-plan. If try $VAR (which isn't supposed to get
        expanded by pegasus-plan), the environment variable (e.g.,
        ${CTRL_MPEXEC_DIR} gets completely dropped from the executable path
        and job dies because cannot find executable (/bin/pipetask).

        So, currently Pegasus plugin only works if environment variables used
        in commands are same on submit machine and compute machine.
        """
        _LOG.debug("GenericWorkflowJob=%s", gwf_job)

        # Save transformation.
        executable = Executable(gwf_job.executable.name,
                                installed=not gwf_job.executable.transfer_executable)
        # Convert bps <ENV:VAR> placeholders to Pegasus ${VAR} form (see
        # Notes above for why ${} rather than $).
        newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", gwf_job.executable.src_uri)
        _LOG.debug("Executable after replacing any environment variables = %s", newexec)
        executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site))
        self.transformation_catalog.add(executable)

        # Create Pegasus Job.
        job = Job(gwf_job.executable.name, id=gwf_job.name, node_label=gwf_job.label)

        if gwf_job.arguments:
            arguments = gwf_job.arguments
            # Replace command variables
            arguments = arguments.format(**gwf_job.cmdvals)

            # Replace env vars
            arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments)
            _LOG.debug("Arguments after replacing any environment variables = %s", arguments)

            # Replace file placeholders
            arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments)
            _LOG.debug("Command line arguments: %s", arguments)

            # Break up command string into separate args for Pegasus Job
            # object replacing file names with Pegasus File objects.
            args = arguments.split()
            logical_file_names = list(set(peg_files) & set(args))
            if logical_file_names:
                indices = [args.index(lfn) for lfn in logical_file_names]
                for idx, lfn in zip(indices, logical_file_names):
                    args[idx] = peg_files[lfn]
            job.addArguments(*args)
        else:
            _LOG.warning("Job %s does not have any arguments", gwf_job.name)

        if gwf_job.request_memory:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory))
        if gwf_job.request_cpus:  # cores
            job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus))
        if gwf_job.request_disk:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk))
        if gwf_job.priority:
            job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority))

        # Add extra job attributes
        for key, value in gwf_job.profile.items():
            job.addProfile(Profile(Namespace.CONDOR, key, value))

        for key, value in gwf_job.environment.items():
            job.addProfile(Profile(Namespace.ENV, key, value))

        # Add run attributes
        for key, value in self.run_attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        for key, value in gwf_job.attrs.items():
            _LOG.debug("create_job: attrs = %s", gwf_job.attrs)
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"'))
        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"'))
        if "quanta_summary" in gwf_job.tags:
            job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_quanta",
                                   value=f"\"{gwf_job.tags['quanta_summary']}\""))

        # Specify job's inputs.
        for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.INPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        # Specify job's outputs
        for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.OUTPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        return job

    def _define_sites(self, out_prefix):
        """Create Pegasus Site Catalog.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for the site catalog file.

        Notes
        -----
        SitesCatalog needs workdir at initialization to create local site for
        submit side directory where the output data from the workflow will be
        stored.
        """
        self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml")

        # Adding information for all sites defined in config instead of
        # limiting to those actually used by the workflow.
        for site, site_data in self.config["site"].items():
            self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"])
            if "directory" in site_data:
                # Workaround because no Python API
                dir_dict = {}
                for site_dir in site_data["directory"]:
                    dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]}
                self.sites_catalog._sites[site]["directories"] = dir_dict

            # Add config provided site attributes.
            if "profile" in site_data:
                for pname, pdata in site_data["profile"].items():
                    for key, val in pdata.items():
                        self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val)
            self.sites_catalog.add_site_profile(site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE",
                                                value=f"{self.name}.node_status")

    def write(self, out_prefix):
        """Write Pegasus Catalogs and DAX to files.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for all the Pegasus workflow files.
        """
        self.submit_path = out_prefix

        # Filenames needed for properties file.
        filenames = {}

        # Write down the workflow in DAX format.
        self.dax_filename = f"{self.dax.name}.dax"
        if out_prefix is not None:
            os.makedirs(out_prefix, exist_ok=True)
            self.dax_filename = os.path.join(out_prefix, self.dax_filename)
        with open(self.dax_filename, "w") as outfh:
            self.dax.writeXML(outfh)

        # Output site catalog.
        filename = f"{self.name}_sites.xml"
        if "scFile" not in self.config:
            self._define_sites(out_prefix)
            self.sites_catalog.workflow_dir = out_prefix
            self.sites_catalog.filename = filename
            self.sites_catalog.write()
        else:
            # Bug fix: copy the user-provided site catalog under the same
            # config key checked above ("scFile"); previously this read the
            # nonexistent "sitesFile" key.
            shutil.copy(self.config["scFile"], os.path.join(self.submit_path, filename))
        filenames["sites"] = filename

        # Output transformation catalog.
        filename = f"{self.name}_tc.txt"
        if self.transformation_catalog is not None:
            self.transformation_catalog.workflow_dir = out_prefix
            self.transformation_catalog.filename = filename
            self.transformation_catalog.write()
        else:
            shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename))
        filenames["transformation"] = filename

        # Output replica catalog.
        filename = f"{self.name}_rc.txt"
        if self.replica_catalog is not None:
            self.replica_catalog.workflow_dir = out_prefix
            self.replica_catalog.filename = filename
            self.replica_catalog.write()
        else:
            # Bug fix: copy the user-provided replica catalog ("rcFile");
            # previously this copied "tcFile" (copy-paste error).
            shutil.copy(self.config["rcFile"], os.path.join(self.submit_path, filename))
        filenames["replica"] = filename

        self.properties_filename = self._write_properties_file(out_prefix, filenames)

    def run_pegasus_plan(self, out_prefix, run_attr):
        """Execute pegasus-plan to convert DAX to HTCondor DAG for submission.

        Parameters
        ----------
        out_prefix : `str`
            Root directory in which to output all files.
        run_attr : `dict`
            Attributes to add to main DAG.

        Raises
        ------
        RuntimeError
            If pegasus-plan exits with a non-zero exit code.
        """
        cmd = f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir " \
              f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} " \
              f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output"
        _LOG.debug("Plan command: %s", cmd)
        pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out"
        with chdir(self.submit_path):
            _LOG.debug("pegasus-plan in directory: %s", os.getcwd())
            _LOG.debug("pegasus-plan output in %s", pegout)
            with open(pegout, "w") as pegfh:
                print(f"Command: {cmd}\n", file=pegfh)  # Note: want blank line
                process = subprocess.run(shlex.split(cmd), shell=False, stdout=pegfh,
                                         stderr=subprocess.STDOUT, check=False)
                if process.returncode != 0:
                    print(f"Error trying to generate Pegasus files. See {pegout}.")
                    raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})")

            # Grab run id from pegasus-plan output and save.
            with open(pegout, "r") as pegfh:
                for line in pegfh:
                    match = re.search(r"pegasus-run\s+(\S+)", line)
                    if match:
                        self.run_id = match.group(1)
                        break

        # Hack - Using profile in sites.xml doesn't add run attributes to DAG
        # submission file. So adding them here:
        if run_attr is not None:
            subname = f"{self.run_id}/{self.name}-0.dag.condor.sub"
            # Keep a backup of the original submit file, then rewrite it
            # inserting the attributes just before the "queue" statement.
            shutil.copyfile(subname, subname + ".orig")
            with open(subname + ".orig", "r") as infh:
                with open(subname, "w") as outfh:
                    for line in infh:
                        line = line.strip()
                        if line == "queue":
                            htc_write_attribs(outfh, run_attr)
                            htc_write_attribs(outfh, {"bps_job_label": "DAG"})
                        print(line, file=outfh)

    def _write_properties_file(self, out_prefix, filenames):
        """Write Pegasus Properties File.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for properties file.
        filenames : `dict` [`str`, `str`]
            Mapping of Pegasus file keys to filenames.

        Returns
        -------
        properties : `str`
            Filename of the pegasus properties file.
        """
        properties = f"{self.name}_pegasus.properties"
        if out_prefix is not None:
            properties = os.path.join(out_prefix, properties)
        with open(properties, "w") as outfh:
            print("# This tells Pegasus where to find the Site Catalog.", file=outfh)
            print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh)

            print("# This tells Pegasus where to find the Replica Catalog.", file=outfh)
            print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh)

            print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh)
            print("pegasus.catalog.transformation=Text", file=outfh)
            print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh)

            print("# Run Pegasus in shared file system mode.", file=outfh)
            print("pegasus.data.configuration=sharedfs", file=outfh)

            print("# Make Pegasus use links instead of transferring files.", file=outfh)
            print("pegasus.transfer.*.impl=Transfer", file=outfh)
            print("pegasus.transfer.links=true", file=outfh)

        return properties