Coverage for python/lsst/ctrl/bps/panda/panda_service.py: 14%

180 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-30 10:55 +0000

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ["PanDAService", "PandaBpsWmsWorkflow"] 

24 

25 

26import binascii 

27import concurrent.futures 

28import json 

29import logging 

30import os 

31 

32import idds.common.utils as idds_utils 

33import pandaclient.idds_api 

34from idds.doma.workflowv2.domapandawork import DomaPanDAWork 

35from idds.workflowv2.workflow import AndCondition 

36from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

37from lsst.ctrl.bps.bps_config import BpsConfig 

38from lsst.ctrl.bps.panda.idds_tasks import IDDSWorkflowGenerator 

39from lsst.ctrl.bps.wms_service import BaseWmsService, BaseWmsWorkflow 

40from lsst.resources import ResourcePath 

41 

42_LOG = logging.getLogger(__name__) 

43 

44 

45class PanDAService(BaseWmsService): 

46 """PanDA version of WMS service""" 

47 

48 def prepare(self, config, generic_workflow, out_prefix=None): 

49 """Convert generic workflow to an PanDA iDDS ready for submission 

50 

51 Parameters 

52 ---------- 

53 config : `lsst.ctrl.bps.BpsConfig` 

54 BPS configuration that includes necessary submit/runtime 

55 information. 

56 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

57 out_prefix : `str` 

58 The root directory into which all WMS-specific files are written 

59 

60 Returns 

61 ------- 

62 workflow : `lsst.ctrl.bps.panda.panda_service.PandaBpsWmsWorkflow` 

63 PanDA workflow ready to be run. 

64 """ 

65 _LOG.debug("out_prefix = '%s'", out_prefix) 

66 workflow = PandaBpsWmsWorkflow.from_generic_workflow( 

67 config, generic_workflow, out_prefix, f"{self.__class__.__module__}." f"{self.__class__.__name__}" 

68 ) 

69 workflow.write(out_prefix) 

70 return workflow 

71 

72 def convert_exec_string_to_hex(self, cmdline): 

73 """Convert the command line into hex representation. 

74 

75 This step is currently involved because large blocks of command lines 

76 including special symbols passed to the pilot/container. To make sure 

77 the 1 to 1 matching and pass by the special symbol stripping 

78 performed by the Pilot we applied the hexing. 

79 

80 Parameters 

81 ---------- 

82 cmdline : `str` 

83 UTF-8 command line string 

84 

85 Returns 

86 ------- 

87 hex : `str` 

88 Hex representation of string 

89 """ 

90 return binascii.hexlify(cmdline.encode()).decode("utf-8") 

91 

92 def add_decoder_prefix(self, cmd_line, distribution_path, files): 

93 """ 

94 Compose the command line sent to the pilot from the functional part 

95 (the actual SW running) and the middleware part (containers invocation) 

96 

97 Parameters 

98 ---------- 

99 cmd_line : `str` 

100 UTF-8 based functional part of the command line 

101 distribution_path : `str` 

102 URI of path where all files are located for distribution 

103 files `list` [`str`] 

104 File names needed for a task 

105 

106 Returns 

107 ------- 

108 decoder_prefix : `str` 

109 Full command line to be executed on the edge node 

110 """ 

111 

112 cmdline_hex = self.convert_exec_string_to_hex(cmd_line) 

113 _, decoder_prefix = self.config.search( 

114 "runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False} 

115 ) 

116 decoder_prefix = decoder_prefix.replace( 

117 "_cmd_line_", 

118 str(cmdline_hex) 

119 + " ${IN/L} " 

120 + distribution_path 

121 + " " 

122 + "+".join(f"{k}:{v}" for k, v in files[0].items()) 

123 + " " 

124 + "+".join(files[1]), 

125 ) 

126 return decoder_prefix 

127 

    def submit(self, workflow):
        """Submit a single PanDA iDDS workflow

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWorkflow`
            A single PanDA iDDS workflow to submit

        Raises
        ------
        RuntimeError
            If the iDDS submission call does not succeed.
        """
        idds_client_workflow = IDDS_client_workflow(name=workflow.name)
        # Stage locally generated inputs to the distribution endpoint so the
        # edge nodes can fetch them; `files` holds (placeholder map,
        # direct-IO set) consumed by add_decoder_prefix below.
        files = self.copy_files_for_distribution(
            workflow.generated_tasks, self.config["fileDistributionEndPoint"]
        )
        DAG_end_work = []
        DAG_final_work = None

        # Optional submission-wide settings; None/defaults are handed
        # straight to DomaPanDAWork.
        _, processing_type = self.config.search("processing_type", opt={"default": None})
        _, task_type = self.config.search("task_type", opt={"default": "test"})
        _, prod_source_label = self.config.search("prodSourceLabel", opt={"default": None})
        _, vo = self.config.search("vo", opt={"default": "wlcg"})

        for idx, task in enumerate(workflow.generated_tasks):
            work = DomaPanDAWork(
                executable=self.add_decoder_prefix(
                    task.executable, self.config["fileDistributionEndPoint"], files
                ),
                # Input/output collections are synthetic ("pseudo") —
                # dependencies are tracked via dependency_map, not data.
                primary_input_collection={
                    "scope": "pseudo_dataset",
                    "name": "pseudo_input_collection#" + str(idx),
                },
                output_collections=[
                    {"scope": "pseudo_dataset", "name": "pseudo_output_collection#" + str(idx)}
                ],
                log_collections=[],
                dependency_map=task.dependencies,
                task_name=task.name,
                task_queue=task.queue,
                task_log={
                    "destination": "local",
                    "value": "log.tgz",
                    "dataset": "PandaJob_#{pandaid}/",
                    "token": "local",
                    "param_type": "log",
                    "type": "template",
                },
                encode_command_line=True,
                task_rss=task.max_rss,
                task_cloud=task.cloud,
                task_site=task.site,
                # Fallback priority/walltime defaults when the task does not
                # specify them.
                task_priority=int(task.priority) if task.priority else 900,
                core_count=task.core_count,
                working_group=task.working_group,
                processing_type=processing_type,
                task_type=task_type,
                prodSourceLabel=prod_source_label if prod_source_label else task.prod_source_label,
                vo=vo,
                maxattempt=task.max_attempt,
                maxwalltime=task.max_walltime if task.max_walltime else 90000,
            )

            idds_client_workflow.add_work(work)
            if task.is_final:
                DAG_final_work = work
            if task.is_dag_end:
                DAG_end_work.append(work)

        if DAG_final_work:
            # Gate the final work on termination of every DAG-end work.
            conditions = []
            for work in DAG_end_work:
                conditions.append(work.is_terminated)
            and_cond = AndCondition(conditions=conditions, true_works=[DAG_final_work])
            idds_client_workflow.add_condition(and_cond)
        idds_client = self.get_idds_client()
        ret = idds_client.submit(idds_client_workflow, username=None, use_dataset_name=False)
        _LOG.debug("iDDS client manager submit returned = %s", ret)

        # Check submission success
        status, result, error = self.get_idds_result(ret)
        if status:
            request_id = int(result)
        else:
            raise RuntimeError(f"Error submitting to PanDA service: {error}")

        _LOG.info("Submitted into iDDs with request id=%s", request_id)
        workflow.run_id = request_id

212 

213 @staticmethod 

214 def copy_files_for_distribution(tasks, file_distribution_uri): 

215 """ 

216 Brings locally generated files into Cloud for further 

217 utilization them on the edge nodes. 

218 

219 Parameters 

220 ---------- 

221 local_pfns: `list` of `tasks` 

222 Tasks that input files needs to be placed for 

223 distribution 

224 file_distribution_uri: `str` 

225 Path on the edge node accessed storage, 

226 including access protocol, bucket name to place files 

227 

228 Returns 

229 ------- 

230 files_plc_hldr, direct_IO_files : `dict` [`str`, `str`], `set` of `str` 

231 First parameters is key values pairs 

232 of file placeholder - file name 

233 Second parameter is set of files which will be directly accessed. 

234 """ 

235 local_pfns = {} 

236 direct_IO_files = set() 

237 for task in tasks: 

238 for file in task.files_used_by_task: 

239 if not file.delivered: 

240 local_pfns[file.name] = file.submission_url 

241 if file.direct_IO: 

242 direct_IO_files.add(file.name) 

243 

244 files_to_copy = {} 

245 

246 # In case there are folders we iterate over its content 

247 for local_pfn in local_pfns.values(): 

248 folder_name = os.path.basename(local_pfn) 

249 if os.path.isdir(local_pfn): 

250 files_in_folder = ResourcePath.findFileResources([local_pfn]) 

251 for file in files_in_folder: 

252 file_name = file.basename() 

253 files_to_copy[file] = ResourcePath( 

254 os.path.join(file_distribution_uri, folder_name, file_name) 

255 ) 

256 else: 

257 files_to_copy[ResourcePath(local_pfn)] = ResourcePath( 

258 os.path.join(file_distribution_uri, folder_name) 

259 ) 

260 

261 copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10) 

262 future_file_copy = [] 

263 for src, trgt in files_to_copy.items(): 

264 

265 # S3 clients explicitly instantiate here to overpass this 

266 # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe 

267 trgt.exists() 

268 future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy")) 

269 for future in concurrent.futures.as_completed(future_file_copy): 

270 if not future.result() is None: 

271 raise RuntimeError("Error of placing files to the distribution point") 

272 

273 if len(direct_IO_files) == 0: 

274 direct_IO_files.add("cmdlineplaceholder") 

275 

276 files_plc_hldr = {} 

277 for file_placeholder, src_path in local_pfns.items(): 

278 files_plc_hldr[file_placeholder] = os.path.basename(src_path) 

279 if os.path.isdir(src_path): 

280 # this is needed to make isdir function working 

281 # properly in ButlerURL instance on the egde node 

282 files_plc_hldr[file_placeholder] += "/" 

283 

284 return files_plc_hldr, direct_IO_files 

285 

286 def get_idds_client(self): 

287 """Get the idds client 

288 

289 Returns 

290 ------- 

291 idds_client: `idds.client.clientmanager.ClientManager` 

292 iDDS ClientManager object. 

293 """ 

294 idds_server = None 

295 if isinstance(self.config, BpsConfig): 

296 _, idds_server = self.config.search("iddsServer", opt={"default": None}) 

297 elif isinstance(self.config, dict) and "iddsServer" in self.config: 

298 idds_server = self.config["iddsServer"] 

299 # if idds_server is None, a default value on the panda relay service 

300 # will be used 

301 idds_client = pandaclient.idds_api.get_api( 

302 idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True 

303 ) 

304 return idds_client 

305 

306 def get_idds_result(self, ret): 

307 """Parse the results returned from iDDS. 

308 

309 Parameters 

310 ---------- 

311 ret: `tuple` of (`int`, (`bool`, payload)). 

312 The first part ret[0] is the status of PanDA relay service. 

313 The part of ret[1][0] is the status of iDDS service. 

314 The part of ret[1][1] is the returned payload. 

315 If ret[1][0] is False, ret[1][1] can be error messages. 

316 

317 Returns 

318 ------- 

319 status: `bool` 

320 The status of iDDS calls. 

321 result: `int` or `list` or `dict` 

322 The result returned from iDDS. 

323 error: `str` 

324 Error messages. 

325 """ 

326 # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html 

327 if not (isinstance(ret, tuple) or isinstance(ret, list)) or ret[0] != 0: 

328 # Something wrong with the PanDA relay service. 

329 # The call may not be delivered to iDDS. 

330 status = False 

331 result = None 

332 error = "PanDA relay service returns errors: %s" % str(ret) 

333 else: 

334 if ret[1][0]: 

335 status = True 

336 result = ret[1][1] 

337 error = None 

338 if isinstance(result, str) and "Authentication no permission" in result: 

339 status = False 

340 result = None 

341 error = result 

342 else: 

343 # iDDS returns errors 

344 status = False 

345 result = None 

346 error = "iDDS returns errors: %s" % str(ret[1][1]) 

347 return status, result, error 

348 

349 def restart(self, wms_workflow_id): 

350 """Restart a workflow from the point of failure. 

351 

352 Parameters 

353 ---------- 

354 wms_workflow_id : `str` 

355 Id that can be used by WMS service to identify workflow that 

356 need to be restarted. 

357 

358 Returns 

359 ------- 

360 wms_id : `str` 

361 Id of the restarted workflow. If restart failed, it will be set 

362 to `None`. 

363 run_name : `str` 

364 Name of the restarted workflow. If restart failed, it will be set 

365 to `None`. 

366 message : `str` 

367 A message describing any issues encountered during the restart. 

368 If there were no issue, an empty string is returned. 

369 """ 

370 idds_client = self.get_idds_client() 

371 ret = idds_client.retry(request_id=wms_workflow_id) 

372 _LOG.debug("Restart PanDA workflow returned = %s", ret) 

373 

374 status, result, error = self.get_idds_result(ret) 

375 if status: 

376 _LOG.info("Restarting PanDA workflow %s", result) 

377 return wms_workflow_id, None, json.dumps(result) 

378 else: 

379 return None, None, "Error retry PanDA workflow: %s" % str(error) 

380 

381 def report(self, wms_workflow_id=None, user=None, hist=0, pass_thru=None, is_global=False): 

382 """Stub for future implementation of the report method 

383 Expected to return run information based upon given constraints. 

384 

385 Parameters 

386 ---------- 

387 wms_workflow_id : `int` or `str` 

388 Limit to specific run based on id. 

389 user : `str` 

390 Limit results to runs for this user. 

391 hist : `float` 

392 Limit history search to this many days. 

393 pass_thru : `str` 

394 Constraints to pass through to HTCondor. 

395 is_global : `bool`, optional 

396 If set, all available job queues will be queried for job 

397 information. Defaults to False which means that only a local job 

398 queue will be queried for information. 

399 

400 Returns 

401 ------- 

402 runs : `list` [`lsst.ctrl.bps.WmsRunReport`] 

403 Information about runs from given job information. 

404 message : `str` 

405 Extra message for report command to print. This could be 

406 pointers to documentation or to WMS specific commands. 

407 """ 

408 raise NotImplementedError 

409 

410 def list_submitted_jobs(self, wms_id=None, user=None, require_bps=True, pass_thru=None, is_global=False): 

411 """Query WMS for list of submitted WMS workflows/jobs. 

412 

413 This should be a quick lookup function to create list of jobs for 

414 other functions. 

415 

416 Parameters 

417 ---------- 

418 wms_id : `int` or `str`, optional 

419 Id or path that can be used by WMS service to look up job. 

420 user : `str`, optional 

421 User whose submitted jobs should be listed. 

422 require_bps : `bool`, optional 

423 Whether to require jobs returned in list to be bps-submitted jobs. 

424 pass_thru : `str`, optional 

425 Information to pass through to WMS. 

426 is_global : `bool`, optional 

427 If set, all available job queues will be queried for job 

428 information. Defaults to False which means that only a local job 

429 queue will be queried for information. 

430 

431 Only applicable in the context of a WMS using distributed job 

432 queues (e.g., HTCondor). A WMS with a centralized job queue 

433 (e.g. PanDA) can safely ignore it. 

434 

435 Returns 

436 ------- 

437 req_ids : `list` [`Any`] 

438 Only job ids to be used by cancel and other functions. Typically 

439 this means top-level jobs (i.e., not children jobs). 

440 """ 

441 if wms_id is None and user is not None: 

442 raise RuntimeError( 

443 "Error to get workflow status report: wms_id is required" 

444 " and filtering workflows with 'user' is not supported." 

445 ) 

446 

447 idds_client = self.get_idds_client() 

448 ret = idds_client.get_requests(request_id=wms_id) 

449 _LOG.debug("PanDA get workflows returned = %s", ret) 

450 

451 status, result, error = self.get_idds_result(ret) 

452 if status: 

453 req_ids = [req["request_id"] for req in result] 

454 return req_ids 

455 else: 

456 raise RuntimeError(f"Error list PanDA workflow requests: {error}") 

457 

458 def cancel(self, wms_id, pass_thru=None): 

459 """Cancel submitted workflows/jobs. 

460 

461 Parameters 

462 ---------- 

463 wms_id : `str` 

464 ID or path of job that should be canceled. 

465 pass_thru : `str`, optional 

466 Information to pass through to WMS. 

467 

468 Returns 

469 ------- 

470 deleted : `bool` 

471 Whether successful deletion or not. Currently, if any doubt or any 

472 individual jobs not deleted, return False. 

473 message : `str` 

474 Any message from WMS (e.g., error details). 

475 """ 

476 idds_client = self.get_idds_client() 

477 ret = idds_client.abort(request_id=wms_id) 

478 _LOG.debug("Abort PanDA workflow returned = %s", ret) 

479 

480 status, result, error = self.get_idds_result(ret) 

481 if status: 

482 _LOG.info("Aborting PanDA workflow %s", result) 

483 return True, json.dumps(result) 

484 else: 

485 return False, "Error abort PanDA workflow: %s" % str(error) 

486 

487 def ping(self, pass_thru=None): 

488 """Checks whether PanDA WMS services are up, reachable, 

489 and can authenticate if authentication is required. 

490 

491 The services to be checked are those needed for submit, report, cancel, 

492 restart, but ping cannot guarantee whether jobs would actually run 

493 successfully. Any messages should be sent directly to the logger. 

494 

495 Parameters 

496 ---------- 

497 pass_thru : `str`, optional 

498 Information to pass through to WMS. 

499 

500 Returns 

501 ------- 

502 status : `int` 

503 0 for success, non-zero for failure 

504 message : `str` 

505 Any message from WMS (e.g., error details). 

506 """ 

507 idds_client = self.get_idds_client() 

508 ret = idds_client.ping() 

509 _LOG.debug("Ping PanDA service returned = %s", ret) 

510 

511 status, result, error = self.get_idds_result(ret) 

512 if status: 

513 if "Status" in result and result["Status"] == "OK": 

514 return 0, None 

515 else: 

516 return -1, "Error ping PanDA service: %s" % str(result) 

517 else: 

518 return -1, "Error ping PanDA service: %s" % str(error) 

519 

520 def run_submission_checks(self): 

521 """Checks to run at start if running WMS specific submission steps. 

522 

523 Any exception other than NotImplementedError will halt submission. 

524 Submit directory may not yet exist when this is called. 

525 """ 

526 for key in ["PANDA_URL"]: 

527 if key not in os.environ: 

528 raise OSError(f"Missing environment variable {key}") 

529 

530 status, message = self.ping() 

531 if status != 0: 

532 raise RuntimeError(message) 

533 

534 

535class PandaBpsWmsWorkflow(BaseWmsWorkflow): 

536 """A single Panda based workflow 

537 

538 Parameters 

539 ---------- 

540 name : `str` 

541 Unique name for Workflow 

542 config : `lsst.ctrl.bps.BpsConfig` 

543 BPS configuration that includes necessary submit/runtime information 

544 """ 

545 

    def __init__(self, name, config=None):
        super().__init__(name, config)
        # Tasks translated from the generic workflow; populated later by
        # `from_generic_workflow` and consumed by PanDAService.submit.
        self.generated_tasks = None

549 

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        # Docstring inherited from parent class
        idds_workflow = cls(generic_workflow.name, config)
        # Translate the generic workflow graph into PanDA/iDDS tasks and
        # stash them on the workflow object for later submission.
        workflow_generator = IDDSWorkflowGenerator(generic_workflow, config)
        idds_workflow.generated_tasks = workflow_generator.define_tasks()
        _LOG.debug("panda dag attribs %s", generic_workflow.run_attrs)
        return idds_workflow

558 

559 def write(self, out_prefix): 

560 """Not yet implemented"""