Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Support for using Pegasus WMS. 

23""" 

24 

25__all__ = ["PegasusService", "PegasusWorkflow"] 

26 

27 

28import os 

29import copy 

30import re 

31import subprocess 

32import shlex 

33import shutil 

34import logging 

35 

36from Pegasus.DAX3 import ADAG, File, Job, Link, PFN, Executable, Profile, Namespace 

37from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog 

38 

39from ... import BaseWmsService, BaseWmsWorkflow 

40from ...bps_utils import chdir 

41from ..htcondor import HTCondorService, htc_write_attribs 

42 

43 

44_LOG = logging.getLogger(__name__) 

45 

46 

class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine."""

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`, optional
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        peg_workflow = PegasusWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                             service_class)
        peg_workflow.write(out_prefix)
        peg_workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return peg_workflow

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A single Pegasus workflow to submit.
        """
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            command = f"pegasus-run {workflow.run_id}"
            with open(f"{workflow.name}_pegasus-run.out", "w") as outfh:
                # subprocess.run waits for completion itself; simpler and
                # safer than a hand-managed Popen/wait pair.
                process = subprocess.run(shlex.split(command), shell=False, stdout=outfh,
                                         stderr=subprocess.STDOUT, check=False)

            if process.returncode != 0:
                raise RuntimeError(f"pegasus-run exited with non-zero exit code ({process.returncode})")

            # Note:
            #
            # No need to save a run id here because it is the same run id
            # generated earlier when running pegasus-plan.

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions.  Typically
            this means top-level jobs (i.e., not children jobs).
        """
        # Pegasus plans down to HTCondor, so delegate the query to the
        # HTCondor plugin.
        htc_service = HTCondorService(self.config)
        return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru)

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            WMS.
        """
        # Delegate to the HTCondor plugin (Pegasus runs on top of HTCondor).
        htc_service = HTCondorService(self.config)
        return htc_service.report(wms_workflow_id, user, hist, pass_thru)

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not.  Currently, if any doubt or
            any individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        # If wms_id is a numeric HTCondor id, use HTCondor plugin to delete.
        # Keep the try body minimal so a ValueError raised inside the
        # HTCondor plugin is not mistaken for a non-numeric id.
        try:
            float(wms_id)
        except ValueError:
            # Not numeric, so treat it as a Pegasus run directory/id.
            command = f"pegasus-remove {wms_id}"
            _LOG.debug(command)
            completed_process = subprocess.run(shlex.split(command), shell=False, check=False,
                                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            _LOG.debug(completed_process.stdout)
            _LOG.debug("Return code = %s", completed_process.returncode)

            if completed_process.returncode != 0:
                deleted = False
                # pegasus-remove output starting with "443" indicates the
                # job is unknown to the batch queue — TODO confirm against
                # pegasus-remove exit conventions.
                if re.match(b"443", completed_process.stdout):
                    message = "no such bps job in batch queue"
                else:
                    message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}"
                    _LOG.debug("pegasus-remove output: %s", completed_process.stdout.decode())
            else:
                deleted = True
                # Original code left message unbound on success, which raised
                # NameError at return; provide an empty message instead.
                message = ""
        else:
            htc_service = HTCondorService(self.config)
            deleted, message = htc_service.cancel(wms_id, pass_thru)

        return deleted, message

196 

class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow.

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config):
        # Base class initializes config, run_id, submit_path.
        super().__init__(name, config)
        self.dax = ADAG(name)
        self.run_attrs = None

        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()
        self.properties_filename = None
        self.dax_filename = None

    def _init_catalogs(self):
        """Create catalogs that do not need the submit directory yet."""
        # Set workdir in catalogs at write time.  So pass None as value here.

        # Replica Catalog keeps mappings of logical file ids/names (LFN's) to
        # physical file ids/names (PFN's).  Only create one if the user did
        # not supply their own via config.
        if "rcFile" not in self.config:
            fname = "rc.txt"
            self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname)

        # Transformation Catalog describes all of the executables
        # (called "transformations") used by the workflow.
        if "tcFile" not in self.config:
            fname = "tc.txt"
            self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname)

        # Note:
        #
        # SitesCatalog needs workdir at initialization to create local site
        # for submit side directory where the output data from the workflow
        # will be stored.  So delaying creation of SitesCatalog until the
        # write function is called with a given output directory.

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited.
        peg_workflow = cls(generic_workflow.name, config)
        peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs)
        peg_workflow.run_attrs["bps_wms_service"] = service_class
        peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}"

        # Create initial Pegasus File objects for all files that WMS must
        # handle.
        peg_files = {}
        for gwf_file in generic_workflow.get_files(data=True, transfer_only=True):
            if gwf_file.wms_transfer:
                peg_file = File(gwf_file.name)
                peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local"))
                peg_files[gwf_file.name] = peg_file

        # Add jobs to the DAX.
        for job_name in generic_workflow:
            gwf_job = generic_workflow.get_job(job_name)
            job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files)
            peg_workflow.dax.addJob(job)

        # Add job dependencies to the DAX.
        for job_name in generic_workflow:
            for child_name in generic_workflow.successors(job_name):
                peg_workflow.dax.depends(parent=peg_workflow.dax.getJob(job_name),
                                         child=peg_workflow.dax.getJob(child_name))

        return peg_workflow

    def create_job(self, generic_workflow, gwf_job, peg_files):
        """Create a Pegasus job corresponding to the given GenericWorkflow job.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a Pegasus job.
        peg_files : `dict` [`str`, `Pegasus.DAX3.File`]
            Pegasus Files needed when creating Pegasus Job.

        Returns
        -------
        job : `Pegasus.DAX3.Job`
            Pegasus job created from the generic workflow job.

        Notes
        -----
        https://pegasus.isi.edu/documentation/reference-guide/variable
        -expansion.html says that ${VAR} gets expanded with submit side
        values during pegasus-plan.  If try $VAR (which isn't supposed to get
        expanded by pegasus-plan), the environment variable (e.g.,
        ${CTRL_MPEXEC_DIR}) gets completely dropped from the executable path
        and job dies because cannot find executable (/bin/pipetask).

        So, currently Pegasus plugin only works if environment variables used
        in commands are same on submit machine and compute machine.
        """
        _LOG.debug("GenericWorkflowJob=%s", gwf_job)
        _LOG.debug("%s gwf_job.cmdline = %s", gwf_job.name, gwf_job.cmdline)
        cmd_parts = gwf_job.cmdline.split(" ", 1)

        # Save transformation.
        executable = Executable(os.path.basename(cmd_parts[0]), installed=True)
        newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", cmd_parts[0])
        _LOG.debug("Executable after replacing any environment variables = %s", newexec)
        executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site))
        self.transformation_catalog.add(executable)

        # Create Pegasus Job.
        job = Job(os.path.basename(cmd_parts[0]), id=gwf_job.name, node_label=gwf_job.label)

        if len(cmd_parts) > 1:
            arguments = cmd_parts[1]
            # Replace command variables.
            arguments = arguments.format(**gwf_job.cmdvals)

            # Replace env vars.
            arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments)
            _LOG.debug("Arguments after replacing any environment variables = %s", arguments)

            # Replace file placeholders.
            arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments)
            _LOG.debug("Command line arguments: %s", arguments)

            # Break up command string into separate args for Pegasus Job
            # object, replacing file names with Pegasus File objects.
            # Substitute every occurrence; the previous list.index-based
            # approach replaced only the first occurrence of a repeated name.
            args = arguments.split()
            logical_file_names = set(peg_files) & set(args)
            args = [peg_files[arg] if arg in logical_file_names else arg for arg in args]
            job.addArguments(*args)
        else:
            _LOG.warning("Job %s does not have any arguments", gwf_job.name)

        if gwf_job.request_memory:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory))
        if gwf_job.request_cpus:  # cores
            job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus))
        if gwf_job.request_disk:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk))
        if gwf_job.priority:
            job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority))

        # Add extra job attributes.
        for key, value in gwf_job.profile.items():
            job.addProfile(Profile(Namespace.CONDOR, key, value))

        for key, value in gwf_job.environment.items():
            job.addProfile(Profile(Namespace.ENV, key, value))

        # Add run attributes.
        for key, value in self.run_attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        _LOG.debug("create_job: attrs = %s", gwf_job.attrs)
        for key, value in gwf_job.attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"'))
        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"'))
        if "quanta_summary" in gwf_job.tags:
            job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_quanta",
                                   value=f"\"{gwf_job.tags['quanta_summary']}\""))

        # Specify job's inputs.
        for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.INPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        # Specify job's outputs.
        for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.OUTPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        return job

    def _define_sites(self, out_prefix):
        """Create Pegasus Site Catalog.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for the site catalog file.

        Notes
        -----
        SitesCatalog needs workdir at initialization to create local site for
        submit side directory where the output data from the workflow will be
        stored.
        """
        self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml")

        # Adding information for all sites defined in config instead of
        # limiting to those actually used by the workflow.
        for site, site_data in self.config["site"].items():
            self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"])
            if "directory" in site_data:
                # HACK: poke the catalog's private state because the Python
                # API offers no way to add directories.
                dir_dict = {}
                for site_dir in site_data["directory"]:
                    dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]}
                self.sites_catalog._sites[site]["directories"] = dir_dict

            # Add config provided site attributes.
            if "profile" in site_data:
                for pname, pdata in site_data["profile"].items():
                    for key, val in pdata.items():
                        self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val)
            self.sites_catalog.add_site_profile(site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE",
                                                value=f"{self.name}.node_status")

    def write(self, out_prefix):
        """Write Pegasus Catalogs and DAX to files.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for all the Pegasus workflow files.
        """
        self.submit_path = out_prefix

        # Filenames needed for properties file.
        filenames = {}

        # Write down the workflow in DAX format.
        self.dax_filename = f"{self.dax.name}.dax"
        if out_prefix is not None:
            os.makedirs(out_prefix, exist_ok=True)
            self.dax_filename = os.path.join(out_prefix, self.dax_filename)
        with open(self.dax_filename, "w") as outfh:
            self.dax.writeXML(outfh)

        # Output site catalog.
        filename = f"{self.name}_sites.xml"
        if "scFile" not in self.config:
            self._define_sites(out_prefix)
            self.sites_catalog.workflow_dir = out_prefix
            self.sites_catalog.filename = filename
            self.sites_catalog.write()
        else:
            # Use the same config key as the membership check above; the
            # original read "sitesFile", which need not exist when "scFile"
            # is the key the user set.
            shutil.copy(self.config["scFile"], os.path.join(self.submit_path, filename))
        filenames["sites"] = filename

        # Output transformation catalog.
        filename = f"{self.name}_tc.txt"
        if self.transformation_catalog is not None:
            self.transformation_catalog.workflow_dir = out_prefix
            self.transformation_catalog.filename = filename
            self.transformation_catalog.write()
        else:
            shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename))
        filenames["transformation"] = filename

        # Output replica catalog.
        filename = f"{self.name}_rc.txt"
        if self.replica_catalog is not None:
            self.replica_catalog.workflow_dir = out_prefix
            self.replica_catalog.filename = filename
            self.replica_catalog.write()
        else:
            # Copy the user-provided replica catalog ("rcFile"); the
            # original mistakenly copied "tcFile" here.
            shutil.copy(self.config["rcFile"], os.path.join(self.submit_path, filename))
        filenames["replica"] = filename

        self.properties_filename = self._write_properties_file(out_prefix, filenames)

    def run_pegasus_plan(self, out_prefix, run_attr):
        """Execute pegasus-plan to convert DAX to HTCondor DAG for submission.

        Parameters
        ----------
        out_prefix : `str`
            Root directory in which to output all files.
        run_attr : `dict`
            Attributes to add to main DAG.

        Raises
        ------
        RuntimeError
            If pegasus-plan exits with a non-zero exit code.
        """
        cmd = f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir " \
              f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} " \
              f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output"
        _LOG.debug("Plan command: %s", cmd)
        pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out"
        with chdir(self.submit_path):
            _LOG.debug("pegasus-plan in directory: %s", os.getcwd())
            _LOG.debug("pegasus-plan output in %s", pegout)
            with open(pegout, "w") as pegfh:
                print(f"Command: {cmd}\n", file=pegfh)  # Note: want blank line
                process = subprocess.run(shlex.split(cmd), shell=False, stdout=pegfh,
                                         stderr=subprocess.STDOUT, check=False)
            if process.returncode != 0:
                print(f"Error trying to generate Pegasus files. See {pegout}.")
                raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})")

            # Grab run id from pegasus-plan output and save.
            with open(pegout, "r") as pegfh:
                for line in pegfh:
                    match = re.search(r"pegasus-run\s+(\S+)", line)
                    if match:
                        self.run_id = match.group(1)
                        break

        # Hack - Using profile in sites.xml doesn't add run attributes to DAG
        # submission file.  So adding them here:
        if run_attr is not None:
            subname = f"{self.run_id}/{self.name}-0.dag.condor.sub"
            shutil.copyfile(subname, subname + ".orig")
            with open(subname + ".orig", "r") as infh:
                with open(subname, "w") as outfh:
                    for line in infh:
                        line = line.strip()
                        # Inject the attributes just before the final
                        # "queue" command so HTCondor picks them up.
                        if line == "queue":
                            htc_write_attribs(outfh, run_attr)
                            htc_write_attribs(outfh, {"bps_job_label": "DAG"})
                        print(line, file=outfh)

    def _write_properties_file(self, out_prefix, filenames):
        """Write Pegasus Properties File.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for properties file.
        filenames : `dict` [`str`, `str`]
            Mapping of Pegasus file keys to filenames.

        Returns
        -------
        properties : `str`
            Filename of the pegasus properties file.
        """
        properties = f"{self.name}_pegasus.properties"
        if out_prefix is not None:
            properties = os.path.join(out_prefix, properties)
        with open(properties, "w") as outfh:
            print("# This tells Pegasus where to find the Site Catalog.", file=outfh)
            print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh)

            print("# This tells Pegasus where to find the Replica Catalog.", file=outfh)
            print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh)

            print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh)
            print("pegasus.catalog.transformation=Text", file=outfh)
            print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh)

            print("# Run Pegasus in shared file system mode.", file=outfh)
            print("pegasus.data.configuration=sharedfs", file=outfh)

            print("# Make Pegasus use links instead of transferring files.", file=outfh)
            print("pegasus.transfer.*.impl=Transfer", file=outfh)
            print("pegasus.transfer.links=true", file=outfh)

        return properties