Coverage for python/lsst/ctrl/bps/wms/pegasus/pegasus_service.py: 3%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

246 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Support for using Pegasus WMS. 

23""" 

24 

25__all__ = ["PegasusService", "PegasusWorkflow"] 

26 

27 

28import copy 

29import logging 

30import os 

31import re 

32import shlex 

33import shutil 

34import subprocess 

35 

36from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog 

37from Pegasus.DAX3 import ADAG, PFN, Executable, File, Job, Link, Namespace, Profile 

38 

39from ... import BaseWmsService, BaseWmsWorkflow 

40from ...bps_utils import chdir 

41from ..htcondor import HTCondorService, htc_write_attribs 

42 

43_LOG = logging.getLogger(__name__) 

44 

45 

class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine."""

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`, optional
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        # Record which plugin produced the submission for provenance.
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        peg_workflow = PegasusWorkflow.from_generic_workflow(
            config, generic_workflow, out_prefix, service_class
        )
        # Write catalogs/DAX, then run pegasus-plan to produce the
        # HTCondor DAG that will actually be submitted.
        peg_workflow.write(out_prefix)
        peg_workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return peg_workflow

74 

75 def submit(self, workflow): 

76 """Submit a single WMS workflow 

77 

78 Parameters 

79 ---------- 

80 workflow : `lsst.ctrl.bps.BaseWorkflow` 

81 A single HTCondor workflow to submit 

82 """ 

83 with chdir(workflow.submit_path): 

84 _LOG.info("Submitting from directory: %s", os.getcwd()) 

85 command = f"pegasus-run {workflow.run_id}" 

86 with open(f"{workflow.name}_pegasus-run.out", "w") as outfh: 

87 process = subprocess.Popen( 

88 shlex.split(command), shell=False, stdout=outfh, stderr=subprocess.STDOUT 

89 ) 

90 process.wait() 

91 

92 if process.returncode != 0: 

93 raise RuntimeError("pegasus-run exited with non-zero exit code (%s)" % process.returncode) 

94 

95 # Note: 

96 # 

97 # No need to save run id as the same as the run id generated when 

98 # running pegasus-plan earlier. 

99 

100 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=True): 

101 """Query WMS for list of submitted WMS workflows/jobs. 

102 

103 This should be a quick lookup function to create list of jobs for 

104 other functions. 

105 

106 Parameters 

107 ---------- 

108 wms_id : `int` or `str`, optional 

109 Id or path that can be used by WMS service to look up job. 

110 user : `str`, optional 

111 User whose submitted jobs should be listed. 

112 require_bps : `bool`, optional 

113 Whether to require jobs returned in list to be bps-submitted jobs. 

114 pass_thru : `str`, optional 

115 Information to pass through to WMS. 

116 is_global : `bool`, optional 

117 If set, all job queues (and their histories) will be queried for 

118 job information. Defaults to False which means that only the local 

119 job queue will be queried. 

120 

121 Returns 

122 ------- 

123 job_ids : `list` [`Any`] 

124 Only job ids to be used by cancel and other functions. Typically 

125 this means top-level jobs (i.e., not children jobs). 

126 """ 

127 htc_service = HTCondorService(self.config) 

128 return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru, is_global) 

129 

130 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=True): 

131 """Query WMS for status of submitted WMS workflows 

132 Parameters 

133 ---------- 

134 wms_workflow_id : `int` or `str`, optional 

135 Id that can be used by WMS service to look up status. 

136 user : `str`, optional 

137 Limit report to submissions by this particular user 

138 hist : `int`, optional 

139 Number of days to expand report to include finished WMS workflows. 

140 pass_thru : `str`, optional 

141 Additional arguments to pass through to the specific WMS service. 

142 is_global : `bool`, optional 

143 If set, all job queues (and their histories) will be queried for 

144 job information. Defaults to False which means that only the local 

145 job queue will be queried. 

146 

147 Returns 

148 ------- 

149 run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`] 

150 Status information for submitted WMS workflows 

151 message : `str` 

152 Message to user on how to find more status information specific to 

153 WMS. 

154 """ 

155 htc_service = HTCondorService(self.config) 

156 return htc_service.report(wms_workflow_id, user, hist, pass_thru, is_global) 

157 

158 def cancel(self, wms_id, pass_thru=None): 

159 """Cancel submitted workflows/jobs. 

160 

161 Parameters 

162 ---------- 

163 wms_id : `str` 

164 ID or path of job that should be canceled. 

165 pass_thru : `str`, optional 

166 Information to pass through to WMS. 

167 

168 Returns 

169 -------- 

170 deleted : `bool` 

171 Whether successful deletion or not. Currently, if any doubt or any 

172 individual jobs not deleted, return False. 

173 message : `str` 

174 Any message from WMS (e.g., error details). 

175 """ 

176 _LOG.debug("Canceling wms_id = %s", wms_id) 

177 

178 # if wms_id is a numeric HTCondor id, use HTCondor plugin to delete 

179 try: 

180 float(wms_id) 

181 htc_service = HTCondorService(self.config) 

182 deleted, message = htc_service.cancel(wms_id, pass_thru) 

183 except ValueError: 

184 command = f"pegasus-remove {wms_id}" 

185 _LOG.debug(command) 

186 completed_process = subprocess.run( 

187 shlex.split(command), 

188 shell=False, 

189 check=False, 

190 stdout=subprocess.PIPE, 

191 stderr=subprocess.STDOUT, 

192 ) 

193 _LOG.debug(completed_process.stdout) 

194 _LOG.debug("Return code = %s", completed_process.returncode) 

195 

196 if completed_process.returncode != 0: 

197 deleted = False 

198 m = re.match(b"443", completed_process.stdout) 

199 if m: 

200 message = "no such bps job in batch queue" 

201 else: 

202 message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}" 

203 print("XXX", completed_process.stdout.decode(), "XXX") 

204 print(message) 

205 else: 

206 deleted = True 

207 

208 return deleted, message 

209 

210 

class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config):
        # Base class holds config, run_id, and submit_path.
        super().__init__(name, config)

        # The abstract DAG that Pegasus will plan into an HTCondor DAG.
        self.dax = ADAG(name)
        self.run_attrs = None

        # Filenames are filled in by write()/_write_properties_file().
        self.dax_filename = None
        self.properties_filename = None

        # Pegasus catalogs; populated by _init_catalogs (sites catalog is
        # deferred until write time).
        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()

234 

235 def _init_catalogs(self): 

236 # Set workdir in catalogs at write time. So pass None as value here. 

237 

238 # Replica Catalog keeps mappings of logical file ids/names (LFN's) to 

239 # physical file ids/names (PFN's) 

240 if "rcFile" not in self.config: 

241 fname = "rc.txt" 

242 self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname) 

243 

244 # Transformation Catalog describes all of the executables 

245 # (called "transformations") used by the workflow. 

246 if "tcFile" not in self.config: 

247 fname = "tc.txt" 

248 self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname) 

249 

250 # Note: 

251 # 

252 # SitesCatalog needs workdir at initialization to create local site 

253 # for submit side directory where the output data from the workflow 

254 # will be stored. So delaying creation of SitesCatalog until all the 

255 # write function is called with a given output directory. 

256 

257 @classmethod 

258 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

259 # Docstring inherited. 

260 peg_workflow = cls(generic_workflow.name, config) 

261 peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs) 

262 peg_workflow.run_attrs["bps_wms_service"] = service_class 

263 peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}" 

264 

265 # Create initial Pegasus File objects for all files that WMS must 

266 # handle. 

267 peg_files = {} 

268 for gwf_file in generic_workflow.get_files(data=True, transfer_only=True): 

269 if gwf_file.wms_transfer: 

270 peg_file = File(gwf_file.name) 

271 peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local")) 

272 peg_files[gwf_file.name] = peg_file 

273 

274 # Add jobs to the DAX. 

275 for job_name in generic_workflow: 

276 gwf_job = generic_workflow.get_job(job_name) 

277 job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files) 

278 peg_workflow.dax.addJob(job) 

279 

280 # Add job dependencies to the DAX. 

281 for job_name in generic_workflow: 

282 for child_name in generic_workflow.successors(job_name): 

283 peg_workflow.dax.depends( 

284 parent=peg_workflow.dax.getJob(job_name), child=peg_workflow.dax.getJob(child_name) 

285 ) 

286 

287 return peg_workflow 

288 

    def create_job(self, generic_workflow, gwf_job, peg_files):
        """Create a Pegasus job corresponding to the given GenericWorkflow job.

        Parameters
        ----------
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow that is being converted.
        gwf_job : `lsst.ctrl.bps.GenericWorkflowJob`
            The generic job to convert to a Pegasus job.
        peg_files : `dict` [`str`, `Pegasus.DAX3.File`]
            Pegasus Files needed when creating Pegasus Job.

        Returns
        -------
        job : `Pegasus.DAX3.Job`
            Pegasus job created from the generic workflow job.

        Notes
        -----
        https://pegasus.isi.edu/documentation/reference-guide/variable
        -expansion.html Says that ${VAR} gets expanded with submit side
        values during pegasus-plan. If try $VAR (which isn't supposed to get
        expanded by pegasus-plan), the environment variable (e.g.,
        ${CTRL_MPEXEC_DIR} gets completely dropped from the executable path
        and job dies because cannot find executable (/bin/pipetask).

        So, currently Pegasus plugin only works if environment variables used
        in commands are same on submit machine and compute machine.
        """
        _LOG.debug("GenericWorkflowJob=%s", gwf_job)

        # Save the executable ("transformation") in the transformation
        # catalog.  installed=False triggers Pegasus to stage the executable.
        executable = Executable(gwf_job.executable.name, installed=not gwf_job.executable.transfer_executable)
        # Convert BPS <ENV:VAR> placeholders to Pegasus ${VAR} syntax,
        # which pegasus-plan expands with submit-side values (see Notes).
        newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", gwf_job.executable.src_uri)
        _LOG.debug("Executable after replacing any environment variables = %s", newexec)
        executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site))
        self.transformation_catalog.add(executable)

        # Create Pegasus Job.
        job = Job(gwf_job.executable.name, id=gwf_job.name, node_label=gwf_job.label)

        if gwf_job.arguments:
            arguments = gwf_job.arguments
            # Replace command variables with their values.
            arguments = arguments.format(**gwf_job.cmdvals)

            # Replace <ENV:VAR> placeholders with ${VAR}.
            arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments)
            _LOG.debug("Arguments after replacing any environment variables = %s", arguments)

            # Replace <FILE:name> placeholders with the bare file name
            # (swapped for Pegasus File objects below).
            arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments)
            _LOG.debug("Command line arguments: %s", arguments)

            # Break up command string into separate args for Pegasus Job
            # object, replacing file names with Pegasus File objects.
            # NOTE(review): list.index finds only the first occurrence of
            # each logical file name; assumes a file name appears at most
            # once in the argument list — confirm.
            args = arguments.split()
            logical_file_names = list(set(peg_files) & set(args))
            if logical_file_names:
                indices = [args.index(lfn) for lfn in logical_file_names]
                for idx, lfn in zip(indices, logical_file_names):
                    args[idx] = peg_files[lfn]

            job.addArguments(*args)
        else:
            _LOG.warning("Job %s does not have any arguments", gwf_job.name)

        # Forward resource requests as HTCondor profile entries.
        if gwf_job.request_memory:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory))
        if gwf_job.request_cpus:  # cores
            job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus))
        if gwf_job.request_disk:  # MB
            job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk))
        if gwf_job.priority:
            job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority))

        # Add extra job attributes.
        for key, value in gwf_job.profile.items():
            job.addProfile(Profile(Namespace.CONDOR, key, value))

        for key, value in gwf_job.environment.items():
            job.addProfile(Profile(Namespace.ENV, key, value))

        # Add run attributes (quoted, '+'-prefixed custom ClassAd
        # attributes in the HTCondor submit file).
        for key, value in self.run_attrs.items():
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        for key, value in gwf_job.attrs.items():
            _LOG.debug("create_job: attrs = %s", gwf_job.attrs)
            job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"'))

        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"'))
        job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"'))
        if "quanta_summary" in gwf_job.tags:
            job.addProfile(
                Profile(
                    Namespace.CONDOR, key="+bps_job_quanta", value=f"\"{gwf_job.tags['quanta_summary']}\""
                )
            )

        # Specify job's inputs; register each PFN in the replica catalog.
        for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.INPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        # Specify job's outputs; register each PFN in the replica catalog.
        for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True):
            peg_file = peg_files[gwf_file.name]
            job.uses(peg_file, link=Link.OUTPUT)
            for pfn in peg_file.pfns:
                self.replica_catalog.add(peg_file.name, pfn.url, pfn.site)

        return job

404 

405 def _define_sites(self, out_prefix): 

406 """Create Pegasus Site Catalog 

407 

408 Parameters 

409 ---------- 

410 out_prefix : `str` 

411 Directory prefix for the site catalog file. 

412 

413 Notes 

414 ----- 

415 SitesCatalog needs workdir at initialization to create local site for 

416 submit side directory where the output data from the workflow will be 

417 stored. 

418 """ 

419 self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml") 

420 

421 # Adding information for all sites defined in config instead of 

422 # limiting to those actually used by the workflow 

423 for site, site_data in self.config["site"].items(): 

424 self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"]) 

425 if "directory" in site_data: 

426 # Workaround because no Python API 

427 dir_dict = {} 

428 for site_dir in site_data["directory"]: 

429 dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]} 

430 self.sites_catalog._sites[site]["directories"] = dir_dict 

431 

432 # add config provided site attributes 

433 if "profile" in site_data: 

434 for pname, pdata in site_data["profile"].items(): 

435 for key, val in pdata.items(): 

436 self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val) 

437 self.sites_catalog.add_site_profile( 

438 site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE", value=f"{self.name}.node_status" 

439 ) 

440 

441 def write(self, out_prefix): 

442 """Write Pegasus Catalogs and DAX to files. 

443 

444 Parameters 

445 ---------- 

446 out_prefix : `str` 

447 Directory prefix for all the Pegasus workflow files. 

448 """ 

449 self.submit_path = out_prefix 

450 

451 # filenames needed for properties file 

452 filenames = {} 

453 

454 # Write down the workflow in DAX format. 

455 self.dax_filename = f"{self.dax.name}.dax" 

456 if out_prefix is not None: 

457 os.makedirs(out_prefix, exist_ok=True) 

458 self.dax_filename = os.path.join(out_prefix, self.dax_filename) 

459 with open(self.dax_filename, "w") as outfh: 

460 self.dax.writeXML(outfh) 

461 

462 # output site catalog 

463 filename = f"{self.name}_sites.xml" 

464 if "scFile" not in self.config: 

465 self._define_sites(out_prefix) 

466 self.sites_catalog.workflow_dir = out_prefix 

467 self.sites_catalog.filename = filename 

468 self.sites_catalog.write() 

469 else: 

470 shutil.copy(self.config["sitesFile"], os.path.join(self.submit_path, filename)) 

471 filenames["sites"] = filename 

472 

473 # output transformation catalog 

474 filename = f"{self.name}_tc.txt" 

475 if self.transformation_catalog is not None: 

476 self.transformation_catalog.workflow_dir = out_prefix 

477 self.transformation_catalog.filename = filename 

478 self.transformation_catalog.write() 

479 else: 

480 shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename)) 

481 filenames["transformation"] = filename 

482 

483 # output replica catalog 

484 filename = f"{self.name}_rc.txt" 

485 if self.replica_catalog is not None: 

486 self.replica_catalog.workflow_dir = out_prefix 

487 self.replica_catalog.filename = filename 

488 self.replica_catalog.write() 

489 else: 

490 shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename)) 

491 filenames["replica"] = filename 

492 

493 self.properties_filename = self._write_properties_file(out_prefix, filenames) 

494 

    def run_pegasus_plan(self, out_prefix, run_attr):
        """Execute pegasus-plan to convert DAX to HTCondor DAG for submission.

        Parameters
        ----------
        out_prefix : `str`
            Root directory in which to output all files.
        run_attr : `dict`
            Attributes to add to main DAG.
        """
        cmd = (
            f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir "
            f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} "
            f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output"
        )
        _LOG.debug("Plan command: %s", cmd)
        pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out"
        with chdir(self.submit_path):
            _LOG.debug("pegasus-plan in directory: %s", os.getcwd())
            _LOG.debug("pegasus-plan output in %s", pegout)
            # Capture all pegasus-plan output (stdout and stderr) in a file,
            # with the command itself recorded first.
            with open(pegout, "w") as pegfh:
                print(f"Command: {cmd}\n", file=pegfh)  # Note: want blank line
                process = subprocess.run(
                    shlex.split(cmd), shell=False, stdout=pegfh, stderr=subprocess.STDOUT, check=False
                )
                if process.returncode != 0:
                    print(f"Error trying to generate Pegasus files. See {pegout}.")
                    raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})")

            # Grab run id from pegasus-plan output and save it: the first
            # "pegasus-run <id>" occurrence holds the run directory id.
            with open(pegout, "r") as pegfh:
                for line in pegfh:
                    match = re.search(r"pegasus-run\s+(\S+)", line)
                    if match:
                        self.run_id = match.group(1)
                        break

            # Hack - Using profile in sites.xml doesn't add run attributes to
            # DAG submission file. So adding them here: keep the original
            # submit file as *.orig and rewrite it, injecting the attributes
            # just before the "queue" statement.
            if run_attr is not None:
                subname = f"{self.run_id}/{self.name}-0.dag.condor.sub"
                shutil.copyfile(subname, subname + ".orig")
                with open(subname + ".orig", "r") as infh:
                    with open(subname, "w") as outfh:
                        for line in infh:
                            line = line.strip()
                            if line == "queue":
                                htc_write_attribs(outfh, run_attr)
                                htc_write_attribs(outfh, {"bps_job_label": "DAG"})
                            print(line, file=outfh)

545 

546 def _write_properties_file(self, out_prefix, filenames): 

547 """Write Pegasus Properties File. 

548 

549 Parameters 

550 ---------- 

551 out_prefix : `str` 

552 Directory prefix for properties file. 

553 filenames : `dict` [`str`, `str`] 

554 Mapping of Pegasus file keys to filenames. 

555 

556 Returns 

557 ------- 

558 properties : `str` 

559 Filename of the pegasus properties file. 

560 """ 

561 properties = f"{self.name}_pegasus.properties" 

562 if out_prefix is not None: 

563 properties = os.path.join(out_prefix, properties) 

564 with open(properties, "w") as outfh: 

565 print("# This tells Pegasus where to find the Site Catalog.", file=outfh) 

566 print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh) 

567 

568 print("# This tells Pegasus where to find the Replica Catalog.", file=outfh) 

569 print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh) 

570 

571 print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh) 

572 print("pegasus.catalog.transformation=Text", file=outfh) 

573 print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh) 

574 

575 print("# Run Pegasus in shared file system mode.", file=outfh) 

576 print("pegasus.data.configuration=sharedfs", file=outfh) 

577 

578 print("# Make Pegasus use links instead of transferring files.", file=outfh) 

579 print("pegasus.transfer.*.impl=Transfer", file=outfh) 

580 print("pegasus.transfer.links=true", file=outfh) 

581 

582 return properties