Coverage for python/lsst/ctrl/bps/panda/utils.py: 9%

192 statements  

coverage.py v7.3.0, created at 2023-08-22 09:52 +0000

1# This file is part of ctrl_bps_panda. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Utilities for bps PanDA plugin.""" 

23 

24__all__ = [ 

25 "copy_files_for_distribution", 

26 "get_idds_client", 

27 "get_idds_result", 

28 "convert_exec_string_to_hex", 

29 "add_decoder_prefix", 

30] 

31 

32import binascii 

33import concurrent.futures 

34import logging 

35import os 

36 

37import idds.common.utils as idds_utils 

38import pandaclient.idds_api 

39from idds.doma.workflowv2.domapandawork import DomaPanDAWork 

40from idds.workflowv2.workflow import AndCondition 

41from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob 

42from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder 

43from lsst.ctrl.bps.panda.constants import ( 

44 PANDA_DEFAULT_CLOUD, 

45 PANDA_DEFAULT_CORE_COUNT, 

46 PANDA_DEFAULT_MAX_ATTEMPTS, 

47 PANDA_DEFAULT_MAX_JOBS_PER_TASK, 

48 PANDA_DEFAULT_MAX_WALLTIME, 

49 PANDA_DEFAULT_PRIORITY, 

50 PANDA_DEFAULT_PROCESSING_TYPE, 

51 PANDA_DEFAULT_PROD_SOURCE_LABEL, 

52 PANDA_DEFAULT_RSS, 

53 PANDA_DEFAULT_TASK_TYPE, 

54 PANDA_DEFAULT_VO, 

55) 

56from lsst.resources import ResourcePath 

57 

58_LOG = logging.getLogger(__name__) 

59 

60 

61def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers): 

62 """Bring locally generated files into the cloud for further

63 use on the edge nodes.

64 

65 Parameters 

66 ---------- 

67 files_to_stage : `dict` [`str`, `str`]

68 Files which need to be copied to a workflow staging area.

69 file_distribution_uri : `str`

70 Path on the storage accessible from the edge nodes,

71 including the access protocol and bucket name where the files are placed.

72 max_copy_workers : `int` 

73 Maximum number of workers for copying files. 

74 

75 Raises 

76 ------ 

77 RuntimeError 

78 Raised when error copying files to the distribution point. 

79 """ 

80 files_to_copy = {} 

81 

82 # In case there are folders, iterate over their contents

83 for local_pfn in files_to_stage.values(): 

84 folder_name = os.path.basename(os.path.normpath(local_pfn)) 

85 if os.path.isdir(local_pfn): 

86 files_in_folder = ResourcePath.findFileResources([local_pfn]) 

87 for file in files_in_folder: 

88 file_name = file.basename() 

89 files_to_copy[file] = ResourcePath( 

90 os.path.join(file_distribution_uri, folder_name, file_name) 

91 ) 

92 else: 

93 files_to_copy[ResourcePath(local_pfn)] = ResourcePath( 

94 os.path.join(file_distribution_uri, folder_name) 

95 ) 

96 

97 copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers) 

98 future_file_copy = [] 

99 for src, trgt in files_to_copy.items(): 

100 _LOG.debug("Staging %s to %s", src, trgt) 

101 # S3 clients are instantiated explicitly here to work around this issue:

102 # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe 

103 trgt.exists() 

104 future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy")) 

105 

106 for future in concurrent.futures.as_completed(future_file_copy): 

107 if future.result() is not None: 

108 raise RuntimeError("Error placing files at the distribution point")

109 

110 
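# Illustrative usage sketch for copy_files_for_distribution (the local paths and
# bucket URI below are hypothetical):
#
#     files_to_stage = {
#         "butlerConfig": "/tmp/submit/u/user/run1/butler.yaml",
#         "runQgraphFile": "/tmp/submit/u/user/run1/run1.qgraph",
#     }
#     copy_files_for_distribution(
#         files_to_stage, "s3://my-bucket/staging/run1", max_copy_workers=10
#     )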

111def get_idds_client(config): 

112 """Get the idds client. 

113 

114 Parameters 

115 ---------- 

116 config : `lsst.ctrl.bps.BpsConfig` 

117 BPS configuration. 

118 

119 Returns 

120 ------- 

121 idds_client : `idds.client.clientmanager.ClientManager`

122 iDDS ClientManager object. 

123 """ 

124 idds_server = None 

125 if isinstance(config, BpsConfig): 

126 _, idds_server = config.search("iddsServer", opt={"default": None}) 

127 elif isinstance(config, dict) and "iddsServer" in config: 

128 idds_server = config["iddsServer"] 

129 # if idds_server is None, a default value on the panda relay service 

130 # will be used 

131 idds_client = pandaclient.idds_api.get_api( 

132 idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True 

133 ) 

134 return idds_client 

135 

136 
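# Illustrative configuration sketch (hypothetical endpoint): the iddsServer key
# read above can be provided in the BPS submit configuration, e.g.
#
#     iddsServer: "https://example-idds.example.org:443/idds"
#
# When the key is absent, the default endpoint of the PanDA relay service is used.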

137def get_idds_result(ret): 

138 """Parse the results returned from iDDS. 

139 

140 Parameters 

141 ---------- 

142 ret : `tuple` of (`int`, (`bool`, payload))

143 The first element, ret[0], is the status of the PanDA relay service.

144 ret[1][0] is the status of the iDDS service.

145 ret[1][1] is the returned payload.

146 If ret[1][0] is False, ret[1][1] contains the error messages.

147 

148 Returns 

149 ------- 

150 status : `bool`

151 The status of the iDDS calls.

152 result : `int`, `list`, or `dict`

153 The result returned from iDDS.

154 error : `str`

155 Error messages. 

156 """ 

157 # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html 

158 if not isinstance(ret, list | tuple) or ret[0] != 0: 

159 # Something wrong with the PanDA relay service. 

160 # The call may not be delivered to iDDS. 

161 status = False 

162 result = None 

163 error = f"PanDA relay service returns errors: {str(ret)}" 

164 else: 

165 if ret[1][0]: 

166 status = True 

167 result = ret[1][1] 

168 error = None 

169 if isinstance(result, str) and "Authentication no permission" in result: 

170 status = False 

171 error = result

172 result = None

173 else: 

174 # iDDS returns errors 

175 status = False 

176 result = None 

177 error = f"iDDS returns errors: {str(ret[1][1])}" 

178 return status, result, error 

179 

180 
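# Illustrative sketch of how the two helpers above compose (the ClientManager
# method name is a placeholder, not a specific API call):
#
#     idds_client = get_idds_client(config)
#     ret = idds_client.<client_manager_call>(...)
#     status, result, error = get_idds_result(ret)
#     if not status:
#         raise RuntimeError(error)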

181def _make_pseudo_filename(config, gwjob): 

182 """Make the job pseudo filename. 

183 

184 Parameters 

185 ---------- 

186 config : `lsst.ctrl.bps.BpsConfig` 

187 BPS configuration. 

188 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

189 Job for which to create the pseudo filename. 

190 

191 Returns 

192 ------- 

193 pseudo_filename : `str` 

194 The pseudo filename for the given job. 

195 """ 

196 cmd_line_embedder = CommandLineEmbedder(config) 

197 _, pseudo_filename = cmd_line_embedder.substitute_command_line( 

198 gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name, [] 

199 ) 

200 return pseudo_filename 

201 

202 

203def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk): 

204 """Make the DOMA Work object for a PanDA task. 

205 

206 Parameters 

207 ---------- 

208 config : `lsst.ctrl.bps.BpsConfig` 

209 BPS configuration. 

210 gwjob : `lsst.ctrl.bps.GenericWorkflowJob` 

211 Job representing the jobs for the PanDA task. 

212 task_count : `int` 

213 Count of PanDA tasks used when making unique names. 

214 task_chunk : `int` 

215 Chunk number of the PanDA task, used when making unique names.

216 

217 Returns 

218 ------- 

219 work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork` 

220 The client representation of a PanDA task. 

221 local_pfns : `dict` [`str`, `str`] 

222 Files which need to be copied to a workflow staging area. 

223 """ 

224 _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob) 

225 cvals = {"curr_cluster": gwjob.label} 

226 _, site = config.search("computeSite", opt={"curvals": cvals, "required": True}) 

227 cvals["curr_site"] = site 

228 _, processing_type = config.search( 

229 "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE} 

230 ) 

231 _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE}) 

232 _, prod_source_label = config.search( 

233 "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL} 

234 ) 

235 _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO}) 

236 

237 _, file_distribution_end_point = config.search( 

238 "fileDistributionEndPoint", opt={"curvals": cvals, "default": None} 

239 ) 

240 

241 _, file_distribution_end_point_default = config.search( 

242 "fileDistributionEndPointDefault", opt={"curvals": cvals, "default": None} 

243 ) 

244 

245 # Assume input files are the same across the task

246 local_pfns = {} 

247 direct_io_files = set() 

248 

249 if gwjob.executable.transfer_executable: 

250 local_pfns["job_executable"] = gwjob.executable.src_uri 

251 job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}" 

252 else: 

253 job_executable = gwjob.executable.src_uri 

254 cmd_line_embedder = CommandLineEmbedder(config) 

255 _LOG.debug( 

256 "job %s inputs = %s, outputs = %s", 

257 gwjob.name, 

258 generic_workflow.get_job_inputs(gwjob.name), 

259 generic_workflow.get_job_outputs(gwjob.name), 

260 ) 

261 

262 cmd_line, _ = cmd_line_embedder.substitute_command_line( 

263 job_executable + " " + gwjob.arguments, 

264 gwjob.cmdvals, 

265 gwjob.name, 

266 generic_workflow.get_job_inputs(gwjob.name) + generic_workflow.get_job_outputs(gwjob.name), 

267 ) 

268 

269 for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True): 

270 local_pfns[gwfile.name] = gwfile.src_uri 

271 if os.path.isdir(gwfile.src_uri): 

272 # this is needed to make the isdir function work

273 # properly in the ButlerURL instance on the edge node

274 local_pfns[gwfile.name] += "/" 

275 

276 if gwfile.job_access_remote: 

277 direct_io_files.add(gwfile.name) 

278 

279 if not direct_io_files: 

280 direct_io_files.add("cmdlineplaceholder") 

281 

282 lsst_temp = "LSST_RUN_TEMP_SPACE" 

283 if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ: 

284 file_distribution_end_point = file_distribution_end_point_default 

285 

286 executable = add_decoder_prefix( 

287 config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files) 

288 ) 

289 work = DomaPanDAWork( 

290 executable=executable, 

291 primary_input_collection={ 

292 "scope": "pseudo_dataset", 

293 "name": f"pseudo_input_collection#{str(task_count)}", 

294 }, 

295 output_collections=[ 

296 {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"} 

297 ], 

298 log_collections=[], 

299 dependency_map=[], 

300 task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}", 

301 task_queue=gwjob.queue, 

302 task_log={ 

303 "destination": "local", 

304 "value": "log.tgz", 

305 "dataset": "PandaJob_#{pandaid}/", 

306 "token": "local", 

307 "param_type": "log", 

308 "type": "template", 

309 }, 

310 encode_command_line=True, 

311 task_rss=gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS, 

312 task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD, 

313 task_site=site, 

314 task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY, 

315 core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT, 

316 working_group=gwjob.accounting_group, 

317 processing_type=processing_type, 

318 task_type=task_type, 

319 prodSourceLabel=prod_source_label, 

320 vo=vo, 

321 maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS, 

322 maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME, 

323 ) 

324 return work, local_pfns 

325 

326 

327def add_final_idds_work( 

328 config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk 

329): 

330 """Add the special final PanDA task to the client workflow. 

331 

332 Parameters 

333 ---------- 

334 config : `lsst.ctrl.bps.BpsConfig` 

335 BPS configuration. 

336 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

337 Generic workflow in which to find the final job. 

338 idds_client_workflow : `idds.workflowv2.workflow.Workflow` 

339 iDDS client representation of the workflow to which the final task 

340 is added. 

341 dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`] 

342 The work nodes in the client workflow which have no successors. 

343 task_count : `int` 

344 Count of PanDA tasks used when making unique names. 

345 task_chunk : `int` 

346 Chunk number of the PanDA task, used when making unique names.

347 

348 Returns 

349 ------- 

350 files : `dict` [`str`, `str`] 

351 Files which need to be copied to a workflow staging area. 

352 

353 Raises 

354 ------ 

355 NotImplementedError 

356 Raised if final job in GenericWorkflow is itself a workflow. 

357 TypeError 

358 Raised if final job in GenericWorkflow is invalid type. 

359 """ 

360 files = {} 

361 

362 # If final job exists in generic workflow, create DAG final job 

363 final = generic_workflow.get_final() 

364 if final: 

365 if isinstance(final, GenericWorkflow): 

366 raise NotImplementedError("PanDA plugin does not support a workflow as the final job") 

367 

368 if not isinstance(final, GenericWorkflowJob): 

369 raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})") 

370 

371 dag_final_work, files = _make_doma_work( 

372 config, 

373 generic_workflow, 

374 final, 

375 task_count, 

376 task_chunk, 

377 ) 

378 pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:" 

379 dag_final_work.dependency_map.append( 

380 {"name": pseudo_filename, "submitted": False, "dependencies": []} 

381 ) 

382 idds_client_workflow.add_work(dag_final_work) 

383 conditions = [] 

384 for work in dag_sink_work: 

385 conditions.append(work.is_terminated) 

386 and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work]) 

387 idds_client_workflow.add_condition(and_cond) 

388 else: 

389 _LOG.debug("No final job in GenericWorkflow") 

390 return files 

391 

392 

393def convert_exec_string_to_hex(cmdline): 

394 """Convert the command line into hex representation. 

395 

396 This step is currently needed because large blocks of command lines,

397 including special symbols, are passed to the pilot/container. Hex encoding

398 guarantees a one-to-one match and bypasses the special-symbol stripping

399 performed by the Pilot.

400 

401 Parameters 

402 ---------- 

403 cmdline : `str` 

404 UTF-8 command line string 

405 

406 Returns 

407 ------- 

408 hex : `str` 

409 Hex representation of string 

410 """ 

411 return binascii.hexlify(cmdline.encode()).decode("utf-8") 

412 

413 
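# Illustrative round trip of the hex encoding above, using only the standard
# library (the values shown are what binascii produces for this input):
#
#     >>> convert_exec_string_to_hex("ls -l")
#     '6c73202d6c'
#     >>> binascii.unhexlify("6c73202d6c").decode("utf-8")
#     'ls -l'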

414def add_decoder_prefix(config, cmd_line, distribution_path, files): 

415 """Compose the command line sent to the pilot from the functional part

416 (the actual software being run) and the middleware part (container invocation).

417 

418 Parameters 

419 ---------- 

420 config : `lsst.ctrl.bps.BpsConfig` 

421 Configuration information 

422 cmd_line : `str` 

423 UTF-8 based functional part of the command line 

424 distribution_path : `str` 

425 URI of path where all files are located for distribution 

426 files : `tuple` [`dict` [`str`, `str`], `list` [`str`]] 

427 File names needed for a task (files to copy locally, files for direct access)

428 

429 Returns 

430 ------- 

431 decoder_prefix : `str` 

432 Full command line to be executed on the edge node 

433 """ 

434 # Manipulate file paths for placement on cmdline 

435 files_plc_hldr = {} 

436 for key, pfn in files[0].items(): 

437 if pfn.endswith("/"): 

438 files_plc_hldr[key] = os.path.basename(pfn[:-1]) 

439 isdir = True 

440 else: 

441 files_plc_hldr[key] = os.path.basename(pfn) 

442 _, extension = os.path.splitext(pfn) 

443 isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != "yaml") 

444 if isdir: 

445 # this is needed to make the isdir function work

446 # properly in the ButlerURL instance on the edge node

447 files_plc_hldr[key] += "/" 

448 _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key]) 

449 

450 cmdline_hex = convert_exec_string_to_hex(cmd_line) 

451 _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False}) 

452 runner_command = runner_command.replace("\n", " ") 

453 decoder_prefix = runner_command.replace( 

454 "_cmd_line_", 

455 str(cmdline_hex) 

456 + " ${IN/L} " 

457 + distribution_path 

458 + " " 

459 + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items()) 

460 + " " 

461 + "+".join(files[1]), 

462 ) 

463 return decoder_prefix 

464 

465 
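# Illustrative shape of the composed command line (the runnerCommand template is
# hypothetical): with runnerCommand = "docker run my-image decode.sh _cmd_line_",
# add_decoder_prefix returns roughly
#
#     docker run my-image decode.sh <hex of cmd_line> ${IN/L} <distribution_path>
#     <key:basename pairs joined by '+'> <direct-access names joined by '+'>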

466def add_idds_work(config, generic_workflow, idds_workflow): 

467 """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS 

468 workflow. 

469 

470 Parameters 

471 ---------- 

472 config : `lsst.ctrl.bps.BpsConfig` 

473 BPS configuration 

474 generic_workflow : `lsst.ctrl.bps.GenericWorkflow` 

475 Generic workflow containing jobs to convert. 

476 idds_workflow : `idds.workflowv2.workflow.Workflow` 

477 iDDS workflow to which the converted jobs should be added. 

478 

479 Returns 

480 ------- 

481 files_to_pre_stage : `dict` [`str`, `str`] 

482 Files that need to be copied to the staging area before submission. 

483 dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`] 

484 The work nodes in the client workflow which have no successors. 

485 task_count : `int` 

486 Number of tasks in iDDS workflow used for unique task names 

487 

488 Raises 

489 ------ 

490 RuntimeError 

491 Raised if dependency issues cannot be resolved after the second pass through the workflow.

492 """ 

493 # Limit number of jobs in single PanDA task 

494 _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK}) 

495 

496 files_to_pre_stage = {} 

497 dag_sink_work = [] # Workflow sink nodes that need to be connected to final task 

498 job_to_task = {} 

499 job_to_pseudo_filename = {} 

500 task_count = 0 # Task number/ID in idds workflow used for unique name 

501 

502 # To avoid failing because the number of passes through the workflow is minimized,

503 # record dependency issues here and resolve them in a second pass later.

504 jobs_with_dependency_issues = {} 

505 

506 # Assume jobs with same label share config values 

507 for job_label in generic_workflow.labels: 

508 _LOG.debug("job_label = %s", job_label) 

509 # Add each job with a particular label to a corresponding PanDA task 

510 # A PanDA task has a limit on number of jobs, so break into multiple 

511 # PanDA tasks if needed. 

512 job_count = 0 # Number of jobs in idds task used for task chunking 

513 task_chunk = 1 # Task chunk number within job label used for unique name 

514 work = None 

515 

516 # Instead of changing code to make chunks up front and round-robin 

517 # assign jobs to chunks, for now keeping chunk creation in loop 

518 # but using knowledge of how many chunks there will be to set better 

519 # maximum number of jobs in a chunk for more even distribution. 

520 jobs_by_label = generic_workflow.get_jobs_by_label(job_label) 

521 num_chunks = -(-len(jobs_by_label) // max_jobs_per_task) # ceil 

522 max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks) 
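# Illustrative numbers: with 10 jobs for this label and maxJobsPerTask = 6,
# num_chunks = ceil(10/6) = 2 and max_jobs_per_task_this_label = ceil(10/2) = 5,
# so the jobs split 5+5 rather than 6+4.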

523 _LOG.debug( 

524 "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s", 

525 job_label, 

526 len(jobs_by_label), 

527 num_chunks, 

528 max_jobs_per_task_this_label, 

529 ) 

530 for gwjob in jobs_by_label: 

531 job_count += 1 

532 if job_count > max_jobs_per_task_this_label: 

533 job_count = 1 

534 task_chunk += 1 

535 

536 if job_count == 1: 

537 # Create new PanDA task object 

538 task_count += 1 

539 work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk) 

540 files_to_pre_stage.update(files) 

541 idds_workflow.add_work(work) 

542 if generic_workflow.out_degree(gwjob.name) == 0: 

543 dag_sink_work.append(work) 

544 

545 pseudo_filename = _make_pseudo_filename(config, gwjob) 

546 job_to_pseudo_filename[gwjob.name] = pseudo_filename 

547 job_to_task[gwjob.name] = work.get_work_name() 

548 deps = [] 

549 missing_deps = False 

550 for parent_job_name in generic_workflow.predecessors(gwjob.name): 

551 if parent_job_name not in job_to_task: 

552 _LOG.debug("job_to_task.keys() = %s", job_to_task.keys()) 

553 missing_deps = True 

554 break 

555 else: 

556 deps.append( 

557 { 

558 "task": job_to_task[parent_job_name], 

559 "inputname": job_to_pseudo_filename[parent_job_name], 

560 "available": False, 

561 } 

562 ) 

563 if not missing_deps: 

564 work.dependency_map.append({"name": pseudo_filename, "dependencies": deps}) 

565 else: 

566 jobs_with_dependency_issues[gwjob.name] = work 

567 

568 # If there were any issues figuring out dependencies through earlier loop 

569 if jobs_with_dependency_issues: 

570 _LOG.warning("Could not prepare workflow in single pass. Please notify developers.") 

571 _LOG.info("Trying to recover...") 

572 for job_name, work in jobs_with_dependency_issues.items(): 

573 deps = [] 

574 for parent_job_name in generic_workflow.predecessors(job_name): 

575 if parent_job_name not in job_to_task: 

576 _LOG.debug("job_to_task.keys() = %s", job_to_task.keys()) 

577 raise RuntimeError( 

578 f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."

579 ) 

580 deps.append( 

581 { 

582 "task": job_to_task[parent_job_name], 

583 "inputname": job_to_pseudo_filename[parent_job_name], 

584 "available": False, 

585 } 

586 ) 

587 pseudo_filename = job_to_pseudo_filename[job_name] 

588 work.dependency_map.append({"name": pseudo_filename, "dependencies": deps}) 

589 _LOG.info("Successfully recovered.") 

590 

591 return files_to_pre_stage, dag_sink_work, task_count