Coverage for python/lsst/ctrl/bps/panda/utils.py: 9%

186 statements  

coverage.py v7.2.5, created at 2023-05-18 02:24 -0700

# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Utilities for bps PanDA plugin."""

__all__ = [
    "copy_files_for_distribution",
    "get_idds_client",
    "get_idds_result",
    "convert_exec_string_to_hex",
    "add_decoder_prefix",
]

import binascii
import concurrent.futures
import logging
import os

import idds.common.utils as idds_utils
import pandaclient.idds_api
from idds.doma.workflowv2.domapandawork import DomaPanDAWork
from idds.workflowv2.workflow import AndCondition
from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder
from lsst.ctrl.bps.panda.constants import (
    PANDA_DEFAULT_CLOUD,
    PANDA_DEFAULT_CORE_COUNT,
    PANDA_DEFAULT_MAX_ATTEMPTS,
    PANDA_DEFAULT_MAX_JOBS_PER_TASK,
    PANDA_DEFAULT_MAX_WALLTIME,
    PANDA_DEFAULT_PRIORITY,
    PANDA_DEFAULT_PROCESSING_TYPE,
    PANDA_DEFAULT_PROD_SOURCE_LABEL,
    PANDA_DEFAULT_RSS,
    PANDA_DEFAULT_TASK_TYPE,
    PANDA_DEFAULT_VO,
)
from lsst.resources import ResourcePath

_LOG = logging.getLogger(__name__)


def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files into the cloud so they can be used
    on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path on the storage accessible from the edge nodes, including the
        access protocol and bucket name, where the files are placed.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when there is an error copying files to the distribution point.
    """
    files_to_copy = {}

    # If an entry is a folder, iterate over its contents.
    for local_pfn in files_to_stage.values():
        folder_name = os.path.basename(os.path.normpath(local_pfn))
        if os.path.isdir(local_pfn):
            files_in_folder = ResourcePath.findFileResources([local_pfn])
            for file in files_in_folder:
                file_name = file.basename()
                files_to_copy[file] = ResourcePath(
                    os.path.join(file_distribution_uri, folder_name, file_name)
                )
        else:
            files_to_copy[ResourcePath(local_pfn)] = ResourcePath(
                os.path.join(file_distribution_uri, folder_name)
            )

    copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers)
    future_file_copy = []
    for src, trgt in files_to_copy.items():
        # Calling exists() here forces the S3 client to be instantiated in the
        # main thread, working around boto3 clients not being thread safe; see
        # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe
        trgt.exists()
        future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy"))

    for future in concurrent.futures.as_completed(future_file_copy):
        if future.result() is not None:
            raise RuntimeError("Error placing files at the distribution point")

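# --- Illustrative usage sketch (not part of the module) ---
# A minimal example of how copy_files_for_distribution might be called;
# the local paths and the bucket URI below are hypothetical.
from lsst.ctrl.bps.panda.utils import copy_files_for_distribution

files_to_stage = {
    "butlerConfig": "/tmp/submit/run1/butler.yaml",  # a single file
    "inputs": "/tmp/submit/run1/inputs",             # a folder, copied file by file
}
copy_files_for_distribution(
    files_to_stage,
    "s3://example-bucket/staging/run1",  # hypothetical distribution endpoint
    max_copy_workers=10,
)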

def get_idds_client(config):
    """Get the iDDS client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        iDDS ClientManager object.
    """
    idds_server = None
    if isinstance(config, BpsConfig):
        _, idds_server = config.search("iddsServer", opt={"default": None})
    elif isinstance(config, dict) and "iddsServer" in config:
        idds_server = config["iddsServer"]
    # If idds_server is None, a default value from the PanDA relay service
    # will be used.
    idds_client = pandaclient.idds_api.get_api(
        idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True
    )
    return idds_client

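# --- Illustrative usage sketch (not part of the module) ---
# get_idds_client accepts either a BpsConfig or a plain dict, per the
# isinstance checks above; the server URL below is a placeholder.
from lsst.ctrl.bps.panda.utils import get_idds_client

client = get_idds_client({"iddsServer": "https://idds.example.org:443/idds"})
# Omitting "iddsServer" falls back to the PanDA relay service default.
client_default = get_idds_client({})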

def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` of (`int`, (`bool`, payload))
        ret[0] is the status of the PanDA relay service.
        ret[1][0] is the status of the iDDS service.
        ret[1][1] is the returned payload.
        If ret[1][0] is False, ret[1][1] can be error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS call.
    result : `int` or `list` or `dict`
        The result returned from iDDS.
    error : `str`
        Error messages.
    """
    # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html
    if not isinstance(ret, (list, tuple)) or ret[0] != 0:
        # Something is wrong with the PanDA relay service.
        # The call may not have been delivered to iDDS.
        status = False
        result = None
        error = f"PanDA relay service returns errors: {str(ret)}"
    else:
        if ret[1][0]:
            status = True
            result = ret[1][1]
            error = None
            if isinstance(result, str) and "Authentication no permission" in result:
                # Keep the message as the error before clearing the result.
                status = False
                error = result
                result = None
        else:
            # iDDS returns errors.
            status = False
            result = None
            error = f"iDDS returns errors: {str(ret[1][1])}"
    return status, result, error

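# --- Illustrative sketch (not part of the module) ---
# Hand-built return values showing the tuple shapes get_idds_result expects;
# the payloads are made up.
from lsst.ctrl.bps.panda.utils import get_idds_result

ok = (0, (True, {"request_id": 1234}))         # relay OK, iDDS OK
idds_err = (0, (False, "workflow not found"))  # relay OK, iDDS error
relay_err = (255, None)                        # relay service itself failed

print(get_idds_result(ok))         # (True, {'request_id': 1234}, None)
print(get_idds_result(idds_err))   # (False, None, 'iDDS returns errors: workflow not found')
print(get_idds_result(relay_err))  # (False, None, 'PanDA relay service returns errors: (255, None)')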

def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name
    )
    return pseudo_filename


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk number of the PanDA task used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()

    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # A trailing slash is needed so that isdir works properly in the
            # ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)

    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns

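# --- Illustrative sketch (not part of the module) ---
# The task_name pattern used above, shown with invented values.
workflow_name = "u_someuser_pipelines_check_20230518T000000Z"
task_count, task_chunk = 3, 1
label = "isr"
task_name = f"{workflow_name}_{task_count:02d}_{label}_{task_chunk:02d}"
print(task_name)  # u_someuser_pipelines_check_20230518T000000Z_03_isr_01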

def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk number of the PanDA task used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow has an invalid type.
    """
    files = {}

    # If a final job exists in the generic workflow, create a DAG final job.
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into its hex representation.

    This step is currently needed because large blocks of command line text,
    including special symbols, are passed to the pilot/container. Hex encoding
    guarantees an exact one-to-one round trip and protects the command line
    from the special-symbol stripping performed by the pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of the string.
    """
    return binascii.hexlify(cmdline.encode()).decode("utf-8")

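# --- Illustrative sketch (not part of the module) ---
# Round-trip check of the hex encoding used above; the command string is
# arbitrary example text.
import binascii

cmd = "echo 'hello; world' && ./run.sh --opt=value"
encoded = binascii.hexlify(cmd.encode()).decode("utf-8")
decoded = binascii.unhexlify(encoded).decode("utf-8")
assert decoded == cmd  # hexlify/unhexlify are exact inverses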

def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual software being run) and the middleware part (the container
    invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (copied local, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.
    """
    # Manipulate file paths for placement on the command line.
    files_plc_hldr = {}
    for key, pfn in files[0].items():
        if pfn.endswith("/"):
            files_plc_hldr[key] = os.path.basename(pfn[:-1])
            isdir = True
        else:
            files_plc_hldr[key] = os.path.basename(pfn)
            _, extension = os.path.splitext(pfn)
            isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != ".yaml")
        if isdir:
            # A trailing slash is needed so that isdir works properly in the
            # ButlerURL instance on the edge node.
            files_plc_hldr[key] += "/"
        _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])

    cmdline_hex = convert_exec_string_to_hex(cmd_line)
    _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
    runner_command = runner_command.replace("\n", " ")
    decoder_prefix = runner_command.replace(
        "_cmd_line_",
        str(cmdline_hex)
        + " ${IN/L} "
        + distribution_path
        + " "
        + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
        + " "
        + "+".join(files[1]),
    )
    return decoder_prefix

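# --- Illustrative sketch (not part of the module) ---
# A toy example of the _cmd_line_ substitution performed above.  The runner
# command, paths, and hex string below are made-up stand-ins, not production
# values.
runner_command = "docker run example-runner bash decoder.sh _cmd_line_"
cmdline_hex = "706970657461736b2e2e2e"  # hex of the functional command line
distribution_path = "s3://example-bucket/staging/run1"
files_plc_hldr = {"butlerConfig": "butler.yaml", "inputs": "inputs/"}
direct_io_files = ["cmdlineplaceholder"]

decoder_prefix = runner_command.replace(
    "_cmd_line_",
    cmdline_hex
    + " ${IN/L} "
    + distribution_path
    + " "
    + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items())
    + " "
    + "+".join(direct_io_files),
)
print(decoder_prefix)
# docker run example-runner bash decoder.sh 706970657461736b2e2e2e ${IN/L}
#   s3://example-bucket/staging/run1 butlerConfig:butler.yaml+inputs:inputs/ cmdlineplaceholder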

def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow, used for unique task names.

    Raises
    ------
    RuntimeError
        Raised if dependency issues cannot be resolved after a second pass
        through the workflow.
    """
    # Limit the number of jobs in a single PanDA task.
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to the final task.
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in the iDDS workflow used for unique names.

    # Only a single (optimized) pass is made through the workflow below, so
    # record jobs with dependency issues here and resolve them in a second
    # pass afterwards instead of failing outright.
    jobs_with_dependency_issues = {}

    # Assume jobs with the same label share config values.
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on the number of jobs, so break into
        # multiple PanDA tasks if needed.
        job_count = 0  # Number of jobs in the iDDS task, used for task chunking.
        task_chunk = 1  # Task chunk number within the job label, used for unique names.
        work = None

        # Instead of changing code to make chunks up front and round-robin
        # assign jobs to chunks, for now keep chunk creation in the loop
        # but use knowledge of how many chunks there will be to set a better
        # maximum number of jobs per chunk for a more even distribution.
        jobs_by_label = generic_workflow.get_jobs_by_label(job_label)
        num_chunks = -(-len(jobs_by_label) // max_jobs_per_task)  # ceil
        max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks)
        _LOG.debug(
            "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s",
            job_label,
            len(jobs_by_label),
            num_chunks,
            max_jobs_per_task_this_label,
        )
        for gwjob in jobs_by_label:
            job_count += 1
            if job_count > max_jobs_per_task_this_label:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create a new PanDA task object.
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                idds_workflow.add_work(work)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
            deps = []
            missing_deps = False
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    missing_deps = True
                    break
                else:
                    deps.append(
                        {
                            "task": job_to_task[parent_job_name],
                            "inputname": job_to_pseudo_filename[parent_job_name],
                            "available": False,
                        }
                    )
            if not missing_deps:
                work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            else:
                jobs_with_dependency_issues[gwjob.name] = work

    # If there were any issues figuring out dependencies in the earlier loop,
    # try to resolve them now.
    if jobs_with_dependency_issues:
        _LOG.warning("Could not prepare workflow in single pass. Please notify developers.")
        _LOG.info("Trying to recover...")
        for job_name, work in jobs_with_dependency_issues.items():
            deps = []
            for parent_job_name in generic_workflow.predecessors(job_name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    raise RuntimeError(
                        f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."
                    )
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            pseudo_filename = job_to_pseudo_filename[job_name]
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
        _LOG.info("Successfully recovered.")

    return files_to_pre_stage, dag_sink_work, task_count
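
# --- Illustrative sketch (not part of the module) ---
# Worked check of the ceiling-division chunking used in add_idds_work; the
# job counts and limit below are arbitrary examples.
max_jobs_per_task = 30000
for n_jobs in (10, 30000, 30001, 70000):
    num_chunks = -(-n_jobs // max_jobs_per_task)    # ceil(n_jobs / limit)
    max_jobs_this_label = -(-n_jobs // num_chunks)  # even-ish split across chunks
    print(n_jobs, num_chunks, max_jobs_this_label)
# 10 1 10
# 30000 1 30000
# 30001 2 15001
# 70000 3 23334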