Coverage for python/lsst/ctrl/bps/wms/pegasus/pegasus_service.py: 3%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

246 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Support for using Pegasus WMS. 

23""" 

24 

25__all__ = ["PegasusService", "PegasusWorkflow"] 

26 

27 

28import os 

29import copy 

30import re 

31import subprocess 

32import shlex 

33import shutil 

34import logging 

35 

36from Pegasus.DAX3 import ADAG, File, Job, Link, PFN, Executable, Profile, Namespace 

37from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog 

38 

39from ... import BaseWmsService, BaseWmsWorkflow 

40from ...bps_utils import chdir 

41from ..htcondor import HTCondorService, htc_write_attribs 

42 

43 

44_LOG = logging.getLogger(__name__) 

45 

46 

class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments)
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        peg_workflow = PegasusWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                             service_class)
        peg_workflow.write(out_prefix)
        peg_workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return peg_workflow

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A single Pegasus workflow to submit.
        """
        with chdir(workflow.submit_path):
            _LOG.info("Submitting from directory: %s", os.getcwd())
            command = f"pegasus-run {workflow.run_id}"
            with open(f"{workflow.name}_pegasus-run.out", "w") as outfh:
                process = subprocess.run(shlex.split(command), shell=False, stdout=outfh,
                                         stderr=subprocess.STDOUT, check=False)

            if process.returncode != 0:
                raise RuntimeError(f"pegasus-run exited with non-zero exit code ({process.returncode})")

        # Note:
        #
        # No need to save run id as the same as the run id generated when
        # running pegasus-plan earlier.

    def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        # Pegasus submits through HTCondor, so delegate the query.
        htc_service = HTCondorService(self.config)
        return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru)

    def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`]
            Status information for submitted WMS workflows
        message : `str`
            Message to user on how to find more status information specific to
            WMS.
        """
        # Pegasus submits through HTCondor, so delegate the query.
        htc_service = HTCondorService(self.config)
        return htc_service.report(wms_workflow_id, user, hist, pass_thru)

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        _LOG.debug("Canceling wms_id = %s", wms_id)

        # If wms_id is a numeric HTCondor id, use the HTCondor plugin to
        # delete; otherwise treat it as a Pegasus run directory/id.
        try:
            float(wms_id)
            htc_service = HTCondorService(self.config)
            deleted, message = htc_service.cancel(wms_id, pass_thru)
        except ValueError:
            # Ensure message is defined even on the success path below.
            message = ""
            command = f"pegasus-remove {wms_id}"
            _LOG.debug(command)
            completed_process = subprocess.run(shlex.split(command), shell=False, check=False,
                                               stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            _LOG.debug(completed_process.stdout)
            _LOG.debug("Return code = %s", completed_process.returncode)

            if completed_process.returncode != 0:
                deleted = False
                # pegasus-remove output starting with HTCondor error 443
                # indicates the workflow is no longer in the batch queue.
                m = re.match(b"443", completed_process.stdout)
                if m:
                    message = "no such bps job in batch queue"
                else:
                    message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}"
                    _LOG.debug("pegasus-remove stdout: %s", completed_process.stdout.decode())
            else:
                deleted = True

        return deleted, message

195 

196 

class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config):
        # config, run_id, submit_path are initialized by the base class.
        super().__init__(name, config)
        self.dax = ADAG(name)
        self.run_attrs = None

        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()
        self.properties_filename = None
        self.dax_filename = None

    def _init_catalogs(self):
        # Set workdir in catalogs at write time. So pass None as value here.

        # Replica Catalog keeps mappings of logical file ids/names (LFN's) to
        # physical file ids/names (PFN's).  Only create one if the user did
        # not provide their own via config.
        if "rcFile" not in self.config:
            fname = "rc.txt"
            self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname)

        # Transformation Catalog describes all of the executables
        # (called "transformations") used by the workflow.
        if "tcFile" not in self.config:
            fname = "tc.txt"
            self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname)

        # Note:
        #
        # SitesCatalog needs workdir at initialization to create local site
        # for submit side directory where the output data from the workflow
        # will be stored. So delaying creation of SitesCatalog until the
        # write function is called with a given output directory.

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited.
        peg_workflow = cls(generic_workflow.name, config)
        peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs)
        peg_workflow.run_attrs["bps_wms_service"] = service_class
        peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}"

        # Create initial Pegasus File objects for all files that WMS must
        # handle.
        peg_files = {}
        for gwf_file in generic_workflow.get_files(data=True, transfer_only=True):
            if gwf_file.wms_transfer:
                peg_file = File(gwf_file.name)
                peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local"))
                peg_files[gwf_file.name] = peg_file

        # Add jobs to the DAX.
        for job_name in generic_workflow:
            gwf_job = generic_workflow.get_job(job_name)
            job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files)
            peg_workflow.dax.addJob(job)

        # Add job dependencies to the DAX.
        for job_name in generic_workflow:
            for child_name in generic_workflow.successors(job_name):
                peg_workflow.dax.depends(parent=peg_workflow.dax.getJob(job_name),
                                         child=peg_workflow.dax.getJob(child_name))

        return peg_workflow

    def create_job(self, generic_workflow, gwf_job, peg_files):
        """Create a Pegasus job corresponding to the given GenericWorkflow job.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a Pegasus job.
        peg_files : `dict` [`str`, `Pegasus.DAX3.File`]
            Pegasus Files needed when creating Pegasus Job.

        Returns
        -------
        job : `Pegasus.DAX3.Job`
            Pegasus job created from the generic workflow job.

        Notes
        -----
        https://pegasus.isi.edu/documentation/reference-guide/variable
        -expansion.html Says that ${VAR} gets expanded with submit side
        values during pegasus-plan. If try $VAR (which isn't supposed to get
        expanded by pegasus-plan), the environment variable (e.g.,
        ${CTRL_MPEXEC_DIR} gets completely dropped from the executable path
        and job dies because cannot find executable (/bin/pipetask).

        So, currently Pegasus plugin only works if environment variables used
        in commands are same on submit machine and compute machine.
        """
        _LOG.debug("GenericWorkflowJob=%s", gwf_job)

        # Save transformation.
        executable = Executable(gwf_job.executable.name,
                                installed=not gwf_job.executable.transfer_executable)
        # Convert BPS <ENV:VAR> placeholders to Pegasus ${VAR} syntax.
        newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", gwf_job.executable.src_uri)
        _LOG.debug("Executable after replacing any environment variables = %s", newexec)
        executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site))
        self.transformation_catalog.add(executable)

        # Create Pegasus Job.
        job = Job(gwf_job.executable.name, id=gwf_job.name, node_label=gwf_job.label)

        if gwf_job.arguments:
            arguments = gwf_job.arguments
            # Replace command variables
            arguments = arguments.format(**gwf_job.cmdvals)

            # Replace env vars
            arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments)
            _LOG.debug("Arguments after replacing any environment variables = %s", arguments)

            # Replace file placeholders
            arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments)
            _LOG.debug("Command line arguments: %s", arguments)

            # Break up command string into separate args for Pegasus Job object
            # replacing file names with Pegasus File objects
            args = arguments.split()
            logical_file_names = list(set(peg_files) & set(args))
            if logical_file_names:
                indices = [args.index(lfn) for lfn in logical_file_names]
                for idx, lfn in zip(indices, logical_file_names):
                    args[idx] = peg_files[lfn]

            job.addArguments(*args)
        else:
            _LOG.warning("Job %s does not have any arguments", gwf_job.name)

        if gwf_job.request_memory:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory))
        if gwf_job.request_cpus:  # cores
            job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus))
        if gwf_job.request_disk:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk))
        if gwf_job.priority:
            job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority))

        # Add extra job attributes
        for key, value in gwf_job.profile.items():
            job.addProfile(Profile(Namespace.CONDOR, key, value))

        for key, value in gwf_job.environment.items():
            job.addProfile(Profile(Namespace.ENV, key, value))

        # Add run attributes
        for key, value in self.run_attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        _LOG.debug("create_job: attrs = %s", gwf_job.attrs)
        for key, value in gwf_job.attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"'))
        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"'))
        if "quanta_summary" in gwf_job.tags:
            job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_quanta",
                                   value=f"\"{gwf_job.tags['quanta_summary']}\""))

        # Specify job's inputs.
        for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.INPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        # Specify job's outputs
        for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.OUTPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        return job

    def _define_sites(self, out_prefix):
        """Create Pegasus Site Catalog

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for the site catalog file.

        Notes
        -----
        SitesCatalog needs workdir at initialization to create local site for
        submit side directory where the output data from the workflow will be
        stored.
        """
        self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml")

        # Adding information for all sites defined in config instead of
        # limiting to those actually used by the workflow
        for site, site_data in self.config["site"].items():
            self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"])
            if "directory" in site_data:
                # Workaround because no Python API
                dir_dict = {}
                for site_dir in site_data["directory"]:
                    dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]}
                self.sites_catalog._sites[site]["directories"] = dir_dict

            # add config provided site attributes
            if "profile" in site_data:
                for pname, pdata in site_data["profile"].items():
                    for key, val in pdata.items():
                        self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val)
            self.sites_catalog.add_site_profile(site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE",
                                                value=f"{self.name}.node_status")

    def write(self, out_prefix):
        """Write Pegasus Catalogs and DAX to files.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for all the Pegasus workflow files.
        """
        self.submit_path = out_prefix

        # filenames needed for properties file
        filenames = {}

        # Write down the workflow in DAX format.
        self.dax_filename = f"{self.dax.name}.dax"
        if out_prefix is not None:
            os.makedirs(out_prefix, exist_ok=True)
            self.dax_filename = os.path.join(out_prefix, self.dax_filename)
        with open(self.dax_filename, "w") as outfh:
            self.dax.writeXML(outfh)

        # output site catalog
        filename = f"{self.name}_sites.xml"
        if "scFile" not in self.config:
            self._define_sites(out_prefix)
            self.sites_catalog.workflow_dir = out_prefix
            self.sites_catalog.filename = filename
            self.sites_catalog.write()
        else:
            # Use the same config key as the guard above so a user-provided
            # site catalog is actually found (was "sitesFile").
            shutil.copy(self.config["scFile"], os.path.join(self.submit_path, filename))
        filenames["sites"] = filename

        # output transformation catalog
        filename = f"{self.name}_tc.txt"
        if self.transformation_catalog is not None:
            self.transformation_catalog.workflow_dir = out_prefix
            self.transformation_catalog.filename = filename
            self.transformation_catalog.write()
        else:
            shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename))
        filenames["transformation"] = filename

        # output replica catalog
        filename = f"{self.name}_rc.txt"
        if self.replica_catalog is not None:
            self.replica_catalog.workflow_dir = out_prefix
            self.replica_catalog.filename = filename
            self.replica_catalog.write()
        else:
            # Copy the user-provided replica catalog (was erroneously
            # copying the transformation catalog, "tcFile").
            shutil.copy(self.config["rcFile"], os.path.join(self.submit_path, filename))
        filenames["replica"] = filename

        self.properties_filename = self._write_properties_file(out_prefix, filenames)

    def run_pegasus_plan(self, out_prefix, run_attr):
        """Execute pegasus-plan to convert DAX to HTCondor DAG for submission.

        Parameters
        ----------
        out_prefix : `str`
            Root directory in which to output all files.
        run_attr : `dict`
            Attributes to add to main DAG.

        Raises
        ------
        RuntimeError
            If pegasus-plan exits with a non-zero exit code.
        """
        cmd = f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir " \
              f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} " \
              f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output"
        _LOG.debug("Plan command: %s", cmd)
        pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out"
        with chdir(self.submit_path):
            _LOG.debug("pegasus-plan in directory: %s", os.getcwd())
            _LOG.debug("pegasus-plan output in %s", pegout)
            with open(pegout, "w") as pegfh:
                print(f"Command: {cmd}\n", file=pegfh)  # Note: want blank line
                process = subprocess.run(shlex.split(cmd), shell=False, stdout=pegfh,
                                         stderr=subprocess.STDOUT, check=False)
                if process.returncode != 0:
                    print(f"Error trying to generate Pegasus files. See {pegout}.")
                    raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})")

            # Grab run id from pegasus-plan output and save
            with open(pegout, "r") as pegfh:
                for line in pegfh:
                    match = re.search(r"pegasus-run\s+(\S+)", line)
                    if match:
                        self.run_id = match.group(1)
                        break

        # Hack - Using profile in sites.xml doesn't add run attributes to DAG
        # submission file. So adding them here:
        if run_attr is not None:
            subname = f"{self.run_id}/{self.name}-0.dag.condor.sub"
            shutil.copyfile(subname, subname + ".orig")
            with open(subname + ".orig", "r") as infh:
                with open(subname, "w") as outfh:
                    for line in infh:
                        line = line.strip()
                        # Insert the attributes just before the queue command.
                        if line == "queue":
                            htc_write_attribs(outfh, run_attr)
                            htc_write_attribs(outfh, {"bps_job_label": "DAG"})
                        print(line, file=outfh)

    def _write_properties_file(self, out_prefix, filenames):
        """Write Pegasus Properties File.

        Parameters
        ----------
        out_prefix : `str`
            Directory prefix for properties file.
        filenames : `dict` [`str`, `str`]
            Mapping of Pegasus file keys to filenames.

        Returns
        -------
        properties : `str`
            Filename of the pegasus properties file.
        """
        properties = f"{self.name}_pegasus.properties"
        if out_prefix is not None:
            properties = os.path.join(out_prefix, properties)
        with open(properties, "w") as outfh:
            print("# This tells Pegasus where to find the Site Catalog.", file=outfh)
            print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh)

            print("# This tells Pegasus where to find the Replica Catalog.", file=outfh)
            print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh)

            print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh)
            print("pegasus.catalog.transformation=Text", file=outfh)
            print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh)

            print("# Run Pegasus in shared file system mode.", file=outfh)
            print("pegasus.data.configuration=sharedfs", file=outfh)

            print("# Make Pegasus use links instead of transferring files.", file=outfh)
            print("pegasus.transfer.*.impl=Transfer", file=outfh)
            print("pegasus.transfer.links=true", file=outfh)

        return properties