Coverage for python/lsst/ctrl/bps/wms/pegasus/pegasus_service.py: 3%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

246 statements  

1# This file is part of ctrl_bps. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Support for using Pegasus WMS. 

23""" 

24 

25__all__ = ["PegasusService", "PegasusWorkflow"] 

26 

27 

28import os 

29import copy 

30import re 

31import subprocess 

32import shlex 

33import shutil 

34import logging 

35 

36from Pegasus.DAX3 import ADAG, File, Job, Link, PFN, Executable, Profile, Namespace 

37from Pegasus.catalogs import replica_catalog, sites_catalog, transformation_catalog 

38 

39from ... import BaseWmsService, BaseWmsWorkflow 

40from ...bps_utils import chdir 

41from ..htcondor import HTCondorService, htc_write_attribs 

42 

43 

44_LOG = logging.getLogger(__name__) 

45 

46 

class PegasusService(BaseWmsService):
    """Pegasus version of workflow engine.
    """
    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow in a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration that includes necessary submit/runtime
            information.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            The generic workflow (e.g., has executable name and arguments).
        out_prefix : `str`
            The root directory into which all WMS-specific files are written.

        Returns
        -------
        peg_workflow : `lsst.ctrl.bps.wms.pegasus.PegasusWorkflow`
            A workflow ready for Pegasus to run.
        """
        # Record which WMS service produced the submission so it can be
        # reported later from the run attributes.
        service_class = f"{self.__class__.__module__}.{self.__class__.__name__}"
        peg_workflow = PegasusWorkflow.from_generic_workflow(config, generic_workflow, out_prefix,
                                                             service_class)
        peg_workflow.write(out_prefix)
        peg_workflow.run_pegasus_plan(out_prefix, generic_workflow.run_attrs)
        return peg_workflow

74 

75 def submit(self, workflow): 

76 """Submit a single WMS workflow 

77 

78 Parameters 

79 ---------- 

80 workflow : `lsst.ctrl.bps.BaseWorkflow` 

81 A single HTCondor workflow to submit 

82 """ 

83 with chdir(workflow.submit_path): 

84 _LOG.info("Submitting from directory: %s", os.getcwd()) 

85 command = f"pegasus-run {workflow.run_id}" 

86 with open(f"{workflow.name}_pegasus-run.out", "w") as outfh: 

87 process = subprocess.Popen(shlex.split(command), shell=False, stdout=outfh, 

88 stderr=subprocess.STDOUT) 

89 process.wait() 

90 

91 if process.returncode != 0: 

92 raise RuntimeError("pegasus-run exited with non-zero exit code (%s)" % process.returncode) 

93 

94 # Note: 

95 # 

96 # No need to save run id as the same as the run id generated when 

97 # running pegasus-plan earlier. 

98 

99 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=True): 

100 """Query WMS for list of submitted WMS workflows/jobs. 

101 

102 This should be a quick lookup function to create list of jobs for 

103 other functions. 

104 

105 Parameters 

106 ---------- 

107 wms_id : `int` or `str`, optional 

108 Id or path that can be used by WMS service to look up job. 

109 user : `str`, optional 

110 User whose submitted jobs should be listed. 

111 require_bps : `bool`, optional 

112 Whether to require jobs returned in list to be bps-submitted jobs. 

113 pass_thru : `str`, optional 

114 Information to pass through to WMS. 

115 is_global : `bool`, optional 

116 If set, all job queues (and their histories) will be queried for 

117 job information. Defaults to False which means that only the local 

118 job queue will be queried. 

119 

120 Returns 

121 ------- 

122 job_ids : `list` [`Any`] 

123 Only job ids to be used by cancel and other functions. Typically 

124 this means top-level jobs (i.e., not children jobs). 

125 """ 

126 htc_service = HTCondorService(self.config) 

127 return htc_service.list_submitted_jobs(wms_id, user, require_bps, pass_thru, is_global) 

128 

129 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=True): 

130 """Query WMS for status of submitted WMS workflows 

131 Parameters 

132 ---------- 

133 wms_workflow_id : `int` or `str`, optional 

134 Id that can be used by WMS service to look up status. 

135 user : `str`, optional 

136 Limit report to submissions by this particular user 

137 hist : `int`, optional 

138 Number of days to expand report to include finished WMS workflows. 

139 pass_thru : `str`, optional 

140 Additional arguments to pass through to the specific WMS service. 

141 is_global : `bool`, optional 

142 If set, all job queues (and their histories) will be queried for 

143 job information. Defaults to False which means that only the local 

144 job queue will be queried. 

145 

146 Returns 

147 ------- 

148 run_reports : `list` [`lsst.ctrl.bps.BaseWmsReport`] 

149 Status information for submitted WMS workflows 

150 message : `str` 

151 Message to user on how to find more status information specific to 

152 WMS. 

153 """ 

154 htc_service = HTCondorService(self.config) 

155 return htc_service.report(wms_workflow_id, user, hist, pass_thru, is_global) 

156 

157 def cancel(self, wms_id, pass_thru=None): 

158 """Cancel submitted workflows/jobs. 

159 

160 Parameters 

161 ---------- 

162 wms_id : `str` 

163 ID or path of job that should be canceled. 

164 pass_thru : `str`, optional 

165 Information to pass through to WMS. 

166 

167 Returns 

168 -------- 

169 deleted : `bool` 

170 Whether successful deletion or not. Currently, if any doubt or any 

171 individual jobs not deleted, return False. 

172 message : `str` 

173 Any message from WMS (e.g., error details). 

174 """ 

175 _LOG.debug("Canceling wms_id = %s", wms_id) 

176 

177 # if wms_id is a numeric HTCondor id, use HTCondor plugin to delete 

178 try: 

179 float(wms_id) 

180 htc_service = HTCondorService(self.config) 

181 deleted, message = htc_service.cancel(wms_id, pass_thru) 

182 except ValueError: 

183 command = f"pegasus-remove {wms_id}" 

184 _LOG.debug(command) 

185 completed_process = subprocess.run(shlex.split(command), shell=False, check=False, 

186 stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 

187 _LOG.debug(completed_process.stdout) 

188 _LOG.debug("Return code = %s", completed_process.returncode) 

189 

190 if completed_process.returncode != 0: 

191 deleted = False 

192 m = re.match(b"443", completed_process.stdout) 

193 if m: 

194 message = "no such bps job in batch queue" 

195 else: 

196 message = f"pegasus-remove exited with non-zero exit code {completed_process.returncode}" 

197 print("XXX", completed_process.stdout.decode(), "XXX") 

198 print(message) 

199 else: 

200 deleted = True 

201 

202 return deleted, message 

203 

204 

class PegasusWorkflow(BaseWmsWorkflow):
    """Single Pegasus Workflow.

    Parameters
    ----------
    name : `str`
        Name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information.
    """

    def __init__(self, name, config):
        # Base class initializes config, run_id, and submit_path.
        super().__init__(name, config)
        self.dax = ADAG(name)
        self.run_attrs = None

        # Pegasus catalogs; filled in by _init_catalogs unless the config
        # supplies pre-made catalog files.
        self.replica_catalog = None
        self.sites_catalog = None
        self.transformation_catalog = None
        self._init_catalogs()

        # Filenames set later by write().
        self.properties_filename = None
        self.dax_filename = None

228 

229 def _init_catalogs(self): 

230 # Set workdir in catalogs at write time. So pass None as value here. 

231 

232 # Replica Catalog keeps mappings of logical file ids/names (LFN's) to 

233 # physical file ids/names (PFN's) 

234 if "rcFile" not in self.config: 

235 fname = "rc.txt" 

236 self.replica_catalog = replica_catalog.ReplicaCatalog(None, fname) 

237 

238 # Transformation Catalog describes all of the executables 

239 # (called "transformations") used by the workflow. 

240 if "tcFile" not in self.config: 

241 fname = "tc.txt" 

242 self.transformation_catalog = transformation_catalog.TransformationCatalog(None, fname) 

243 

244 # Note: 

245 # 

246 # SitesCatalog needs workdir at initialization to create local site 

247 # for submit side directory where the output data from the workflow 

248 # will be stored. So delaying creation of SitesCatalog until all the 

249 # write function is called with a given output directory. 

250 

251 @classmethod 

252 def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): 

253 # Docstring inherited. 

254 peg_workflow = cls(generic_workflow.name, config) 

255 peg_workflow.run_attrs = copy.deepcopy(generic_workflow.run_attrs) 

256 peg_workflow.run_attrs["bps_wms_service"] = service_class 

257 peg_workflow.run_attrs["bps_wms_workflow"] = f"{cls.__module__}.{cls.__name__}" 

258 

259 # Create initial Pegasus File objects for all files that WMS must 

260 # handle. 

261 peg_files = {} 

262 for gwf_file in generic_workflow.get_files(data=True, transfer_only=True): 

263 if gwf_file.wms_transfer: 

264 peg_file = File(gwf_file.name) 

265 peg_file.addPFN(PFN(f"file://{gwf_file.src_uri}", "local")) 

266 peg_files[gwf_file.name] = peg_file 

267 

268 # Add jobs to the DAX. 

269 for job_name in generic_workflow: 

270 gwf_job = generic_workflow.get_job(job_name) 

271 job = peg_workflow.create_job(generic_workflow, gwf_job, peg_files) 

272 peg_workflow.dax.addJob(job) 

273 

274 # Add job dependencies to the DAX. 

275 for job_name in generic_workflow: 

276 for child_name in generic_workflow.successors(job_name): 

277 peg_workflow.dax.depends(parent=peg_workflow.dax.getJob(job_name), 

278 child=peg_workflow.dax.getJob(child_name)) 

279 

280 return peg_workflow 

281 

282 def create_job(self, generic_workflow, gwf_job, peg_files): 

283 """Create a Pegasus job corresponding to the given GenericWorkflow job. 

284 

285 Parameters 

286 ---------- 

287 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

288 Generic workflow that is being converted. 

289 gwf_job : `lsst.ctrl.bps.GenericWorkflowJob` 

290 The generic job to convert to a Pegasus job. 

291 peg_files : `dict` [`str`, `Pegasus.DAX3.File`] 

292 Pegasus Files needed when creating Pegasus Job. 

293 

294 Returns 

295 ------- 

296 job : `Pegasus.DAX3.Job` 

297 Pegasus job created from the generic workflow job. 

298 

299 Notes 

300 ----- 

301 https://pegasus.isi.edu/documentation/reference-guide/variable 

302 -expansion.html Says that ${VAR} gets expanded with submit side 

303 values during pegasus-plan. If try $VAR (which isn't supposed to get 

304 expanded by pegasus-plan), the environment variable (e.g., 

305 ${CTRL_MPEXEC_DIR} gets completely dropped from the executable path 

306 and job dies because cannot find executable (/bin/pipetask). 

307 

308 So, currently Pegasus plugin only works if environment variables used 

309 in commands are same on submit machine and compute machine. 

310 """ 

311 _LOG.debug("GenericWorkflowJob=%s", gwf_job) 

312 

313 # Save transformation. 

314 executable = Executable(gwf_job.executable.name, 

315 installed=not gwf_job.executable.transfer_executable) 

316 newexec = re.sub(r"<ENV:([^>]+)>", r"${\1}", gwf_job.executable.src_uri) 

317 _LOG.debug("Executable after replacing any environment variables = %s", newexec) 

318 executable.addPFN(PFN(f"file://{newexec}", gwf_job.compute_site)) 

319 self.transformation_catalog.add(executable) 

320 

321 # Create Pegasus Job. 

322 job = Job(gwf_job.executable.name, id=gwf_job.name, node_label=gwf_job.label) 

323 

324 if gwf_job.arguments: 

325 arguments = gwf_job.arguments 

326 # Replace command variables 

327 arguments = arguments.format(**gwf_job.cmdvals) 

328 

329 # Replace env vars 

330 arguments = re.sub(r"<ENV:([^>]+)>", r"${\1}", arguments) 

331 _LOG.debug("Arguments after replacing any environment variables = %s", arguments) 

332 

333 # Replace file placeholders 

334 arguments = re.sub(r"<FILE:([^>]+)>", r"\1", arguments) 

335 _LOG.debug("Command line arguments: %s", arguments) 

336 

337 # Break up command string into separate args for Pegasus Job object 

338 # replacing file names with Pegasus File objects 

339 args = arguments.split() 

340 logical_file_names = list(set(peg_files) & set(args)) 

341 if logical_file_names: 

342 indices = [args.index(lfn) for lfn in logical_file_names] 

343 for idx, lfn in zip(indices, logical_file_names): 

344 args[idx] = peg_files[lfn] 

345 

346 job.addArguments(*args) 

347 else: 

348 _LOG.warning("Job %s does not have any arguments", gwf_job.name) 

349 

350 if gwf_job.request_memory: # MB 

351 job.addProfile(Profile(Namespace.CONDOR, "request_memory", gwf_job.request_memory)) 

352 if gwf_job.request_cpus: # cores 

353 job.addProfile(Profile(Namespace.CONDOR, "request_cpus", gwf_job.request_cpus)) 

354 if gwf_job.request_disk: # MB 

355 job.addProfile(Profile(Namespace.CONDOR, "request_disk", gwf_job.request_disk)) 

356 if gwf_job.priority: # MB 

357 job.addProfile(Profile(Namespace.CONDOR, "priority", gwf_job.priority)) 

358 

359 # Add extra job attributes 

360 for key, value in gwf_job.profile.items(): 

361 job.addProfile(Profile(Namespace.CONDOR, key, value)) 

362 

363 for key, value in gwf_job.environment.items(): 

364 job.addProfile(Profile(Namespace.ENV, key, value)) 

365 

366 # Add run attributes 

367 for key, value in self.run_attrs.items(): 

368 job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"')) 

369 

370 for key, value in gwf_job.attrs.items(): 

371 _LOG.debug("create_job: attrs = %s", gwf_job.attrs) 

372 job.addProfile(Profile(Namespace.CONDOR, key=f"+{key}", value=f'"{value}"')) 

373 

374 job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_name", value=f'"{gwf_job.name}"')) 

375 job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_label", value=f'"{gwf_job.label}"')) 

376 if "quanta_summary" in gwf_job.tags: 

377 job.addProfile(Profile(Namespace.CONDOR, key="+bps_job_quanta", 

378 value=f"\"{gwf_job.tags['quanta_summary']}\"")) 

379 

380 # Specify job's inputs. 

381 for gwf_file in generic_workflow.get_job_inputs(gwf_job.name, data=True, transfer_only=True): 

382 peg_file = peg_files[gwf_file.name] 

383 job.uses(peg_file, link=Link.INPUT) 

384 for pfn in peg_file.pfns: 

385 self.replica_catalog.add(peg_file.name, pfn.url, pfn.site) 

386 

387 # Specify job's outputs 

388 for gwf_file in generic_workflow.get_job_outputs(gwf_job.name, data=True, transfer_only=True): 

389 peg_file = peg_files[gwf_file.name] 

390 job.uses(peg_file, link=Link.OUTPUT) 

391 for pfn in peg_file.pfns: 

392 self.replica_catalog.add(peg_file.name, pfn.url, pfn.site) 

393 

394 return job 

395 

396 def _define_sites(self, out_prefix): 

397 """Create Pegasus Site Catalog 

398 

399 Parameters 

400 ---------- 

401 out_prefix : `str` 

402 Directory prefix for the site catalog file. 

403 

404 Notes 

405 ----- 

406 SitesCatalog needs workdir at initialization to create local site for 

407 submit side directory where the output data from the workflow will be 

408 stored. 

409 """ 

410 self.sites_catalog = sites_catalog.SitesCatalog(out_prefix, f"{self.name}_sites.xml") 

411 

412 # Adding information for all sites defined in config instead of 

413 # limiting to those actually used by the workflow 

414 for site, site_data in self.config["site"].items(): 

415 self.sites_catalog.add_site(site, arch=site_data["arch"], os=site_data["os"]) 

416 if "directory" in site_data: 

417 # Workaround because no Python API 

418 dir_dict = {} 

419 for site_dir in site_data["directory"]: 

420 dir_dict[site_dir] = {"path": site_data["directory"][site_dir]["path"]} 

421 self.sites_catalog._sites[site]["directories"] = dir_dict 

422 

423 # add config provided site attributes 

424 if "profile" in site_data: 

425 for pname, pdata in site_data["profile"].items(): 

426 for key, val in pdata.items(): 

427 self.sites_catalog.add_site_profile(site, namespace=pname, key=key, value=val) 

428 self.sites_catalog.add_site_profile(site, namespace=Namespace.DAGMAN, key="NODE_STATUS_FILE", 

429 value=f"{self.name}.node_status") 

430 

431 def write(self, out_prefix): 

432 """Write Pegasus Catalogs and DAX to files. 

433 

434 Parameters 

435 ---------- 

436 out_prefix : `str` 

437 Directory prefix for all the Pegasus workflow files. 

438 """ 

439 self.submit_path = out_prefix 

440 

441 # filenames needed for properties file 

442 filenames = {} 

443 

444 # Write down the workflow in DAX format. 

445 self.dax_filename = f"{self.dax.name}.dax" 

446 if out_prefix is not None: 

447 os.makedirs(out_prefix, exist_ok=True) 

448 self.dax_filename = os.path.join(out_prefix, self.dax_filename) 

449 with open(self.dax_filename, "w") as outfh: 

450 self.dax.writeXML(outfh) 

451 

452 # output site catalog 

453 filename = f"{self.name}_sites.xml" 

454 if "scFile" not in self.config: 

455 self._define_sites(out_prefix) 

456 self.sites_catalog.workflow_dir = out_prefix 

457 self.sites_catalog.filename = filename 

458 self.sites_catalog.write() 

459 else: 

460 shutil.copy(self.config["sitesFile"], os.path.join(self.submit_path, filename)) 

461 filenames["sites"] = filename 

462 

463 # output transformation catalog 

464 filename = f"{self.name}_tc.txt" 

465 if self.transformation_catalog is not None: 

466 self.transformation_catalog.workflow_dir = out_prefix 

467 self.transformation_catalog.filename = filename 

468 self.transformation_catalog.write() 

469 else: 

470 shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename)) 

471 filenames["transformation"] = filename 

472 

473 # output replica catalog 

474 filename = f"{self.name}_rc.txt" 

475 if self.replica_catalog is not None: 

476 self.replica_catalog.workflow_dir = out_prefix 

477 self.replica_catalog.filename = filename 

478 self.replica_catalog.write() 

479 else: 

480 shutil.copy(self.config["tcFile"], os.path.join(self.submit_path, filename)) 

481 filenames["replica"] = filename 

482 

483 self.properties_filename = self._write_properties_file(out_prefix, filenames) 

484 

485 def run_pegasus_plan(self, out_prefix, run_attr): 

486 """Execute pegasus-plan to convert DAX to HTCondor DAG for submission. 

487 

488 Parameters 

489 ---------- 

490 out_prefix : `str` 

491 Root directory in which to output all files. 

492 run_attr : `dict` 

493 Attributes to add to main DAG. 

494 """ 

495 cmd = f"pegasus-plan --verbose --conf {self.properties_filename} --dax {self.dax_filename} --dir " \ 

496 f"{out_prefix}/peg --cleanup none --sites {self.config['computeSite']} " \ 

497 f"--input-dir {out_prefix}/input --output-dir {out_prefix}/output" 

498 _LOG.debug("Plan command: %s", cmd) 

499 pegout = f"{self.submit_path}/{self.name}_pegasus-plan.out" 

500 with chdir(self.submit_path): 

501 _LOG.debug("pegasus-plan in directory: %s", os.getcwd()) 

502 _LOG.debug("pegasus-plan output in %s", pegout) 

503 with open(pegout, "w") as pegfh: 

504 print(f"Command: {cmd}\n", file=pegfh) # Note: want blank line 

505 process = subprocess.run(shlex.split(cmd), shell=False, stdout=pegfh, 

506 stderr=subprocess.STDOUT, check=False) 

507 if process.returncode != 0: 

508 print(f"Error trying to generate Pegasus files. See {pegout}.") 

509 raise RuntimeError(f"pegasus-plan exited with non-zero exit code ({process.returncode})") 

510 

511 # Grab run id from pegasus-plan output and save 

512 with open(pegout, "r") as pegfh: 

513 for line in pegfh: 

514 match = re.search(r"pegasus-run\s+(\S+)", line) 

515 if match: 

516 self.run_id = match.group(1) 

517 break 

518 

519 # Hack - Using profile in sites.xml doesn't add run attributes to DAG 

520 # submission file. So adding them here: 

521 if run_attr is not None: 

522 subname = f"{self.run_id}/{self.name}-0.dag.condor.sub" 

523 shutil.copyfile(subname, subname + ".orig") 

524 with open(subname + ".orig", "r") as infh: 

525 with open(subname, "w") as outfh: 

526 for line in infh: 

527 line = line.strip() 

528 if line == "queue": 

529 htc_write_attribs(outfh, run_attr) 

530 htc_write_attribs(outfh, {"bps_job_label": "DAG"}) 

531 print(line, file=outfh) 

532 

533 def _write_properties_file(self, out_prefix, filenames): 

534 """Write Pegasus Properties File. 

535 

536 Parameters 

537 ---------- 

538 out_prefix : `str` 

539 Directory prefix for properties file. 

540 filenames : `dict` [`str`, `str`] 

541 Mapping of Pegasus file keys to filenames. 

542 

543 Returns 

544 ------- 

545 properties : `str` 

546 Filename of the pegasus properties file. 

547 """ 

548 properties = f"{self.name}_pegasus.properties" 

549 if out_prefix is not None: 

550 properties = os.path.join(out_prefix, properties) 

551 with open(properties, "w") as outfh: 

552 print("# This tells Pegasus where to find the Site Catalog.", file=outfh) 

553 print(f"pegasus.catalog.site.file={filenames['sites']}", file=outfh) 

554 

555 print("# This tells Pegasus where to find the Replica Catalog.", file=outfh) 

556 print(f"pegasus.catalog.replica.file={filenames['replica']}", file=outfh) 

557 

558 print("# This tells Pegasus where to find the Transformation Catalog.", file=outfh) 

559 print("pegasus.catalog.transformation=Text", file=outfh) 

560 print(f"pegasus.catalog.transformation.file={filenames['transformation']}", file=outfh) 

561 

562 print("# Run Pegasus in shared file system mode.", file=outfh) 

563 print("pegasus.data.configuration=sharedfs", file=outfh) 

564 

565 print("# Make Pegasus use links instead of transferring files.", file=outfh) 

566 print("pegasus.transfer.*.impl=Transfer", file=outfh) 

567 print("pegasus.transfer.links=true", file=outfh) 

568 

569 return properties