Coverage for python/lsst/ctrl/bps/panda/utils.py: 9%

266 statements  

coverage.py v7.5.0, created at 2024-05-01 03:49 -0700

# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

28"""Utilities for bps PanDA plugin.""" 

29 

30__all__ = [ 

31 "copy_files_for_distribution", 

32 "get_idds_client", 

33 "get_idds_result", 

34 "convert_exec_string_to_hex", 

35 "add_decoder_prefix", 

36] 

37 

38import binascii 

39import concurrent.futures 

40import logging 

41import os 

42import tarfile 

43import uuid 

44 

45import idds.common.utils as idds_utils 

46import pandaclient.idds_api 

47from idds.doma.workflowv2.domapandawork import DomaPanDAWork 

48from idds.workflowv2.workflow import AndCondition 

49from idds.workflowv2.workflow import Workflow as IDDS_client_workflow 

50from lsst.ctrl.bps import BpsConfig, GenericWorkflow, GenericWorkflowJob 

51from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder 

52from lsst.ctrl.bps.panda.constants import ( 

53 PANDA_DEFAULT_CLOUD, 

54 PANDA_DEFAULT_CORE_COUNT, 

55 PANDA_DEFAULT_MAX_ATTEMPTS, 

56 PANDA_DEFAULT_MAX_JOBS_PER_TASK, 

57 PANDA_DEFAULT_MAX_WALLTIME, 

58 PANDA_DEFAULT_PRIORITY, 

59 PANDA_DEFAULT_PROCESSING_TYPE, 

60 PANDA_DEFAULT_PROD_SOURCE_LABEL, 

61 PANDA_DEFAULT_RSS, 

62 PANDA_DEFAULT_RSS_MAX, 

63 PANDA_DEFAULT_TASK_TYPE, 

64 PANDA_DEFAULT_VO, 

65) 

66from lsst.resources import ResourcePath 

67 

68_LOG = logging.getLogger(__name__) 

69 

70 

def copy_files_for_distribution(files_to_stage, file_distribution_uri, max_copy_workers):
    """Bring locally generated files into the cloud for further use
    on the edge nodes.

    Parameters
    ----------
    files_to_stage : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    file_distribution_uri : `str`
        Path on the storage accessible from the edge nodes, including the
        access protocol and the bucket name where files are placed.
    max_copy_workers : `int`
        Maximum number of workers for copying files.

    Raises
    ------
    RuntimeError
        Raised when an error occurs while copying files to the distribution
        point.

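    Examples
    --------
    A minimal usage sketch; the local path and the S3 endpoint below are
    purely illustrative and not taken from this package.

    >>> copy_files_for_distribution(  # doctest: +SKIP
    ...     {"butlerConfig": "/tmp/submit/butler.yaml"},
    ...     "s3://bucket/stage",
    ...     max_copy_workers=10,
    ... )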
89 """ 

90 files_to_copy = {} 

91 

92 # In case there are folders we iterate over its content 

93 for local_pfn in files_to_stage.values(): 

94 folder_name = os.path.basename(os.path.normpath(local_pfn)) 

95 if os.path.isdir(local_pfn): 

96 files_in_folder = ResourcePath.findFileResources([local_pfn]) 

97 for file in files_in_folder: 

98 file_name = file.basename() 

99 files_to_copy[file] = ResourcePath( 

100 os.path.join(file_distribution_uri, folder_name, file_name) 

101 ) 

102 else: 

103 files_to_copy[ResourcePath(local_pfn)] = ResourcePath( 

104 os.path.join(file_distribution_uri, folder_name) 

105 ) 

106 

107 copy_executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_copy_workers) 

108 future_file_copy = [] 

109 for src, trgt in files_to_copy.items(): 

110 _LOG.debug("Staging %s to %s", src, trgt) 

111 # S3 clients explicitly instantiate here to overpass this 

112 # https://stackoverflow.com/questions/52820971/is-boto3-client-thread-safe 

113 trgt.exists() 

114 future_file_copy.append(copy_executor.submit(trgt.transfer_from, src, transfer="copy")) 

115 

116 for future in concurrent.futures.as_completed(future_file_copy): 

117 if future.result() is not None: 

118 raise RuntimeError("Error of placing files to the distribution point") 

119 

120 

def get_idds_client(config):
    """Get the iDDS client.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.

    Returns
    -------
    idds_client : `idds.client.clientmanager.ClientManager`
        The iDDS ClientManager object.

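    Examples
    --------
    A minimal sketch; the server URL and the follow-up call are purely
    illustrative. A plain `dict` with an ``iddsServer`` key works as well as
    a `~lsst.ctrl.bps.BpsConfig`.

    >>> idds_client = get_idds_client(  # doctest: +SKIP
    ...     {"iddsServer": "https://idds.example.org:443/idds"}
    ... )
    >>> ret = idds_client.get_requests(request_id=1234)  # doctest: +SKIP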
133 """ 

134 idds_server = None 

135 if isinstance(config, BpsConfig): 

136 _, idds_server = config.search("iddsServer", opt={"default": None}) 

137 elif isinstance(config, dict) and "iddsServer" in config: 

138 idds_server = config["iddsServer"] 

139 # if idds_server is None, a default value on the panda relay service 

140 # will be used 

141 idds_client = pandaclient.idds_api.get_api( 

142 idds_utils.json_dumps, idds_host=idds_server, compress=True, manager=True 

143 ) 

144 return idds_client 

145 

146 

def get_idds_result(ret):
    """Parse the results returned from iDDS.

    Parameters
    ----------
    ret : `tuple` [`int`, `tuple` [`bool`, payload]]
        ``ret[0]`` is the status of the PanDA relay service.
        ``ret[1][0]`` is the status of the iDDS service.
        ``ret[1][1]`` is the returned payload.
        If ``ret[1][0]`` is `False`, ``ret[1][1]`` can be error messages.

    Returns
    -------
    status : `bool`
        The status of the iDDS calls.
    result : `int` or `list` or `dict` or `None`
        The result returned from iDDS. `None` if error state.
    error : `str` or `None`
        Error messages. `None` if no error state.

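    Examples
    --------
    Worked examples with hand-built return tuples; the payload contents are
    purely illustrative.

    >>> get_idds_result((0, (True, [{"workload_id": 1234}])))
    (True, [{'workload_id': 1234}], None)
    >>> status, result, error = get_idds_result((-1, "connection refused"))
    >>> status, result
    (False, None)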
166 """ 

167 # https://panda-wms.readthedocs.io/en/latest/client/rest_idds.html 

168 if not isinstance(ret, list | tuple) or ret[0] != 0: 

169 # Something wrong with the PanDA relay service. 

170 # The call may not be delivered to iDDS. 

171 status = False 

172 result = None 

173 error = f"PanDA relay service returns errors: {str(ret)}" 

174 else: 

175 if ret[1][0]: 

176 status = True 

177 result = ret[1][1] 

178 error = None 

179 if isinstance(result, str) and "Authentication no permission" in result: 

180 status = False 

181 result = None 

182 error = result 

183 else: 

184 # iDDS returns errors 

185 status = False 

186 result = None 

187 error = f"iDDS returns errors: {str(ret[1][1])}" 

188 return status, result, error 

189 

190 

def _make_pseudo_filename(config, gwjob):
    """Make the job pseudo filename.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which to create the pseudo filename.

    Returns
    -------
    pseudo_filename : `str`
        The pseudo filename for the given job.
    """
    cmd_line_embedder = CommandLineEmbedder(config)
    _, pseudo_filename = cmd_line_embedder.substitute_command_line(
        gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, gwjob.name, []
    )
    return pseudo_filename


def _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk):
    """Make the DOMA Work object for a PanDA task.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job representing the jobs for the PanDA task.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk count within a PanDA task used when making unique names.

    Returns
    -------
    work : `idds.doma.workflowv2.domapandawork.DomaPanDAWork`
        The client representation of a PanDA task.
    local_pfns : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.
    """
    _LOG.debug("Using gwjob %s to create new PanDA task (gwjob=%s)", gwjob.name, gwjob)
    cvals = {"curr_cluster": gwjob.label}
    _, site = config.search("computeSite", opt={"curvals": cvals, "required": True})
    cvals["curr_site"] = site
    _, processing_type = config.search(
        "processing_type", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROCESSING_TYPE}
    )
    _, task_type = config.search("taskType", opt={"curvals": cvals, "default": PANDA_DEFAULT_TASK_TYPE})
    _, prod_source_label = config.search(
        "prodSourceLabel", opt={"curvals": cvals, "default": PANDA_DEFAULT_PROD_SOURCE_LABEL}
    )
    _, vo = config.search("vo", opt={"curvals": cvals, "default": PANDA_DEFAULT_VO})

    _, file_distribution_end_point = config.search(
        "fileDistributionEndPoint", opt={"curvals": cvals, "default": None}
    )

    _, file_distribution_end_point_default = config.search(
        "fileDistributionEndPointDefault", opt={"curvals": cvals, "default": None}
    )

    task_rss = gwjob.request_memory if gwjob.request_memory else PANDA_DEFAULT_RSS
    task_rss_retry_step = task_rss * gwjob.memory_multiplier if gwjob.memory_multiplier else 0
    task_rss_retry_offset = 0 if task_rss_retry_step else task_rss

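    # Illustrative numbers (not from any config): request_memory = 4096 and
    # memory_multiplier = 2 give task_rss = 4096, task_rss_retry_step = 8192
    # and task_rss_retry_offset = 0; without a multiplier the step is 0 and
    # the offset equals task_rss.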

    # Assume input files are the same across the task.
    local_pfns = {}
    direct_io_files = set()

    if gwjob.executable.transfer_executable:
        local_pfns["job_executable"] = gwjob.executable.src_uri
        job_executable = f"./{os.path.basename(gwjob.executable.src_uri)}"
    else:
        job_executable = gwjob.executable.src_uri
    cmd_line_embedder = CommandLineEmbedder(config)
    _LOG.debug(
        "job %s inputs = %s, outputs = %s",
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name),
        generic_workflow.get_job_outputs(gwjob.name),
    )

    cmd_line, _ = cmd_line_embedder.substitute_command_line(
        job_executable + " " + gwjob.arguments,
        gwjob.cmdvals,
        gwjob.name,
        generic_workflow.get_job_inputs(gwjob.name) + generic_workflow.get_job_outputs(gwjob.name),
    )

    for gwfile in generic_workflow.get_job_inputs(gwjob.name, transfer_only=True):
        local_pfns[gwfile.name] = gwfile.src_uri
        if os.path.isdir(gwfile.src_uri):
            # This is needed to make the isdir function work properly
            # in the ButlerURL instance on the edge node.
            local_pfns[gwfile.name] += "/"

        if gwfile.job_access_remote:
            direct_io_files.add(gwfile.name)

    if not direct_io_files:
        direct_io_files.add("cmdlineplaceholder")

    lsst_temp = "LSST_RUN_TEMP_SPACE"
    if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ:
        file_distribution_end_point = file_distribution_end_point_default

    executable = add_decoder_prefix(
        config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files)
    )
    work = DomaPanDAWork(
        executable=executable,
        primary_input_collection={
            "scope": "pseudo_dataset",
            "name": f"pseudo_input_collection#{str(task_count)}",
        },
        output_collections=[
            {"scope": "pseudo_dataset", "name": f"pseudo_output_collection#{str(task_count)}"}
        ],
        log_collections=[],
        dependency_map=[],
        task_name=f"{generic_workflow.name}_{task_count:02d}_{gwjob.label}_{task_chunk:02d}",
        task_queue=gwjob.queue,
        task_log={
            "destination": "local",
            "value": "log.tgz",
            "dataset": "PandaJob_#{pandaid}/",
            "token": "local",
            "param_type": "log",
            "type": "template",
        },
        encode_command_line=True,
        task_rss=task_rss,
        task_rss_retry_offset=task_rss_retry_offset,
        task_rss_retry_step=task_rss_retry_step,
        task_rss_max=gwjob.request_memory_max if gwjob.request_memory_max else PANDA_DEFAULT_RSS_MAX,
        task_cloud=gwjob.compute_cloud if gwjob.compute_cloud else PANDA_DEFAULT_CLOUD,
        task_site=site,
        task_priority=int(gwjob.priority) if gwjob.priority else PANDA_DEFAULT_PRIORITY,
        core_count=gwjob.request_cpus if gwjob.request_cpus else PANDA_DEFAULT_CORE_COUNT,
        working_group=gwjob.accounting_group,
        processing_type=processing_type,
        task_type=task_type,
        prodSourceLabel=prod_source_label,
        vo=vo,
        maxattempt=gwjob.number_of_retries if gwjob.number_of_retries else PANDA_DEFAULT_MAX_ATTEMPTS,
        maxwalltime=gwjob.request_walltime if gwjob.request_walltime else PANDA_DEFAULT_MAX_WALLTIME,
    )
    return work, local_pfns


def add_final_idds_work(
    config, generic_workflow, idds_client_workflow, dag_sink_work, task_count, task_chunk
):
    """Add the special final PanDA task to the client workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow in which to find the final job.
    idds_client_workflow : `idds.workflowv2.workflow.Workflow`
        The iDDS client representation of the workflow to which the final task
        is added.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Count of PanDA tasks used when making unique names.
    task_chunk : `int`
        Chunk count within a PanDA task used when making unique names.

    Returns
    -------
    files : `dict` [`str`, `str`]
        Files which need to be copied to a workflow staging area.

    Raises
    ------
    NotImplementedError
        Raised if the final job in the GenericWorkflow is itself a workflow.
    TypeError
        Raised if the final job in the GenericWorkflow is an invalid type.
    """
    files = {}

    # If final job exists in generic workflow, create DAG final job.
    final = generic_workflow.get_final()
    if final:
        if isinstance(final, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")

        if not isinstance(final, GenericWorkflowJob):
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final)})")

        dag_final_work, files = _make_doma_work(
            config,
            generic_workflow,
            final,
            task_count,
            task_chunk,
        )
        pseudo_filename = "pure_pseudoinput+qgraphNodeId:+qgraphId:"
        dag_final_work.dependency_map.append(
            {"name": pseudo_filename, "submitted": False, "dependencies": []}
        )
        idds_client_workflow.add_work(dag_final_work)
        conditions = []
        for work in dag_sink_work:
            conditions.append(work.is_terminated)
        and_cond = AndCondition(conditions=conditions, true_works=[dag_final_work])
        idds_client_workflow.add_condition(and_cond)
    else:
        _LOG.debug("No final job in GenericWorkflow")
    return files


def convert_exec_string_to_hex(cmdline):
    """Convert the command line into its hex representation.

    This step is currently needed because large blocks of command line,
    including special symbols, are passed to the pilot/container. Hex
    encoding guarantees a one-to-one match and bypasses the special-symbol
    stripping performed by the pilot.

    Parameters
    ----------
    cmdline : `str`
        UTF-8 command line string.

    Returns
    -------
    hex : `str`
        Hex representation of string.

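    Examples
    --------
    A short doctest showing the encoding of a simple command line.

    >>> convert_exec_string_to_hex("ls -l")
    '6c73202d6c'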
427 """ 

428 return binascii.hexlify(cmdline.encode()).decode("utf-8") 

429 

430 

def add_decoder_prefix(config, cmd_line, distribution_path, files):
    """Compose the command line sent to the pilot from the functional part
    (the actual software to run) and the middleware part (container
    invocation).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration information.
    cmd_line : `str`
        UTF-8 based functional part of the command line.
    distribution_path : `str`
        URI of the path where all files are located for distribution.
    files : `tuple` [`dict` [`str`, `str`], `list` [`str`]]
        File names needed for a task (copied local, direct access).

    Returns
    -------
    decoder_prefix : `str`
        Full command line to be executed on the edge node.

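    Examples
    --------
    A minimal sketch, assuming ``cfg`` is a `~lsst.ctrl.bps.BpsConfig` whose
    ``runnerCommand`` contains the ``_cmd_line_`` placeholder; the command
    line, endpoint and file names below are purely illustrative.

    >>> prefix = add_decoder_prefix(  # doctest: +SKIP
    ...     cfg,
    ...     "pipetask run -b butler.yaml ...",
    ...     "s3://bucket/stage",
    ...     ({"butlerConfig": "/tmp/submit/butler.yaml"}, ["runQgraphFile"]),
    ... )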
450 """ 

451 # Manipulate file paths for placement on cmdline 

452 files_plc_hldr = {} 

453 for key, pfn in files[0].items(): 

454 if pfn.endswith("/"): 

455 files_plc_hldr[key] = os.path.basename(pfn[:-1]) 

456 isdir = True 

457 else: 

458 files_plc_hldr[key] = os.path.basename(pfn) 

459 _, extension = os.path.splitext(pfn) 

460 isdir = os.path.isdir(pfn) or (key == "butlerConfig" and extension != "yaml") 

461 if isdir: 

462 # this is needed to make isdir function working 

463 # properly in ButlerURL instance on the egde node 

464 files_plc_hldr[key] += "/" 

465 _LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key]) 

466 

467 cmdline_hex = convert_exec_string_to_hex(cmd_line) 

468 _, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False}) 

469 runner_command = runner_command.replace("\n", " ") 

470 decoder_prefix = runner_command.replace( 

471 "_cmd_line_", 

472 str(cmdline_hex) 

473 + " ${IN/L} " 

474 + distribution_path 

475 + " " 

476 + "+".join(f"{k}:{v}" for k, v in files_plc_hldr.items()) 

477 + " " 

478 + "+".join(files[1]), 

479 ) 

480 return decoder_prefix 

481 

482 

def add_idds_work(config, generic_workflow, idds_workflow):
    """Convert GenericWorkflowJobs to iDDS work and add them to the iDDS
    workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing jobs to convert.
    idds_workflow : `idds.workflowv2.workflow.Workflow`
        The iDDS workflow to which the converted jobs should be added.

    Returns
    -------
    files_to_pre_stage : `dict` [`str`, `str`]
        Files that need to be copied to the staging area before submission.
    dag_sink_work : `list` [`idds.doma.workflowv2.domapandawork.DomaPanDAWork`]
        The work nodes in the client workflow which have no successors.
    task_count : `int`
        Number of tasks in the iDDS workflow used for unique task names.

    Raises
    ------
    RuntimeError
        Raised if dependency issues cannot be recovered from after a pass
        through the workflow.
    """
    # Limit number of jobs in a single PanDA task.
    _, max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": PANDA_DEFAULT_MAX_JOBS_PER_TASK})

    files_to_pre_stage = {}
    dag_sink_work = []  # Workflow sink nodes that need to be connected to final task.
    job_to_task = {}
    job_to_pseudo_filename = {}
    task_count = 0  # Task number/ID in iDDS workflow used for unique name.

    # To avoid dying due to optimizing number of times through workflow,
    # catch dependency issues to loop through again later.
    jobs_with_dependency_issues = {}

    # Assume jobs with the same label share config values.
    for job_label in generic_workflow.labels:
        _LOG.debug("job_label = %s", job_label)
        # Add each job with a particular label to a corresponding PanDA task.
        # A PanDA task has a limit on the number of jobs, so break into
        # multiple PanDA tasks if needed.
        job_count = 0  # Number of jobs in iDDS task used for task chunking.
        task_chunk = 1  # Task chunk number within job label used for unique name.
        work = None

        # Instead of changing code to make chunks up front and round-robin
        # assign jobs to chunks, for now keeping chunk creation in loop
        # but using knowledge of how many chunks there will be to set better
        # maximum number of jobs in a chunk for more even distribution.

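        # Illustrative numbers (not from any config): 10 jobs under this
        # label with max_jobs_per_task = 7 give num_chunks = ceil(10 / 7) = 2
        # and max_jobs_per_task_this_label = ceil(10 / 2) = 5, i.e. two
        # chunks of 5 jobs instead of chunks of 7 and 3.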
        jobs_by_label = generic_workflow.get_jobs_by_label(job_label)
        num_chunks = -(-len(jobs_by_label) // max_jobs_per_task)  # ceil
        max_jobs_per_task_this_label = -(-len(jobs_by_label) // num_chunks)
        _LOG.debug(
            "For job_label = %s, num jobs = %s, num_chunks = %s, max_jobs = %s",
            job_label,
            len(jobs_by_label),
            num_chunks,
            max_jobs_per_task_this_label,
        )
        for gwjob in jobs_by_label:
            job_count += 1
            if job_count > max_jobs_per_task_this_label:
                job_count = 1
                task_chunk += 1

            if job_count == 1:
                # Create new PanDA task object.
                task_count += 1
                work, files = _make_doma_work(config, generic_workflow, gwjob, task_count, task_chunk)
                files_to_pre_stage.update(files)
                idds_workflow.add_work(work)
                if generic_workflow.out_degree(gwjob.name) == 0:
                    dag_sink_work.append(work)

            pseudo_filename = _make_pseudo_filename(config, gwjob)
            job_to_pseudo_filename[gwjob.name] = pseudo_filename
            job_to_task[gwjob.name] = work.get_work_name()
            deps = []
            missing_deps = False
            for parent_job_name in generic_workflow.predecessors(gwjob.name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    missing_deps = True
                    break
                else:
                    deps.append(
                        {
                            "task": job_to_task[parent_job_name],
                            "inputname": job_to_pseudo_filename[parent_job_name],
                            "available": False,
                        }
                    )
            if not missing_deps:
                work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
            else:
                jobs_with_dependency_issues[gwjob.name] = work

    # If there were any issues figuring out dependencies through earlier loop.
    if jobs_with_dependency_issues:
        _LOG.warning("Could not prepare workflow in single pass. Please notify developers.")
        _LOG.info("Trying to recover...")
        for job_name, work in jobs_with_dependency_issues.items():
            deps = []
            for parent_job_name in generic_workflow.predecessors(job_name):
                if parent_job_name not in job_to_task:
                    _LOG.debug("job_to_task.keys() = %s", job_to_task.keys())
                    raise RuntimeError(
                        f"Could not recover from dependency issues ({job_name} missing {parent_job_name})."
                    )
                deps.append(
                    {
                        "task": job_to_task[parent_job_name],
                        "inputname": job_to_pseudo_filename[parent_job_name],
                        "available": False,
                    }
                )
            pseudo_filename = job_to_pseudo_filename[job_name]
            work.dependency_map.append({"name": pseudo_filename, "dependencies": deps})
        _LOG.info("Successfully recovered.")

    return files_to_pre_stage, dag_sink_work, task_count


def create_archive_file(submit_path, archive_filename, files):
    """Create a gzipped tar archive containing the given files, placing it
    under ``submit_path`` when ``archive_filename`` is not absolute."""
    if not archive_filename.startswith("/"):
        archive_filename = os.path.join(submit_path, archive_filename)

    with tarfile.open(archive_filename, "w:gz", dereference=True) as tar:
        for local_file in files:
            base_name = os.path.basename(local_file)
            tar.add(local_file, arcname=base_name)
    return archive_filename


def copy_files_to_pandacache(filename):
    """Upload a file to the PanDA cache.

    Parameters
    ----------
    filename : `str`
        Local path of the file to upload.

    Returns
    -------
    filename : `str` or `None`
        URL of the file in the PanDA cache, `None` if the upload failed.
    """
    from pandaclient import Client

    attempt = 0
    max_attempts = 3
    done = False
    while attempt < max_attempts and not done:
        attempt += 1
        status, out = Client.putFile(filename, True)
        if status == 0:
            done = True
    print(f"copy_files_to_pandacache: status: {status}, out: {out}")
    if out.startswith("NewFileName:"):
        # Found the same input sandbox to reuse.
        filename = out.split(":")[-1]
    elif out != "True":
        print(out)
        return None

    filename = os.path.basename(filename)
    cache_path = os.path.join(os.environ["PANDACACHE_URL"], "cache")
    filename = os.path.join(cache_path, filename)
    return filename


def get_task_parameter(config, remote_build, key):
    """Look up a task parameter, preferring the remote build config over
    the main BPS config."""
    search_opt = {"replaceVars": True, "expandEnvVars": False, "replaceEnvVars": False, "required": False}
    _, value = remote_build.search(key, search_opt)
    if not value:
        _, value = config.search(key, search_opt)
    return value


def create_idds_build_workflow(**kwargs):
    """Create an iDDS workflow containing a single remote build task."""
    config = kwargs.get("config")
    remote_build = kwargs.get("remote_build")
    config_file = kwargs.get("config_file")
    config_file_base = os.path.basename(config_file) if config_file else None
    compute_site = kwargs.get("compute_site")
    _, files = remote_build.search("files", opt={"default": []})
    submit_path = config["submitPath"]
    files.append(config_file)
    archive_filename = "jobO.%s.tar.gz" % str(uuid.uuid4())
    archive_filename = create_archive_file(submit_path, archive_filename, files)
    _LOG.info("archive file name: %s", archive_filename)
    remote_filename = copy_files_to_pandacache(archive_filename)
    _LOG.info("pandacache file: %s", remote_filename)

    _LOG.info(type(remote_build))
    search_opt = {"replaceVars": True, "expandEnvVars": False, "replaceEnvVars": False, "required": False}
    cvals = {"LSST_VERSION": get_task_parameter(config, remote_build, "LSST_VERSION")}
    cvals["custom_lsst_setup"] = get_task_parameter(config, remote_build, "custom_lsst_setup")
    search_opt["curvals"] = cvals
    _, executable = remote_build.search("runnerCommand", opt=search_opt)
    executable = executable.replace("_download_cmd_line_", remote_filename)
    executable = executable.replace("_build_cmd_line_", config_file_base)
    executable = executable.replace("_compute_site_", compute_site or "")

    task_cloud = get_task_parameter(config, remote_build, "computeCloud")
    task_site = get_task_parameter(config, remote_build, "computeSite")
    task_queue = get_task_parameter(config, remote_build, "queue")
    task_rss = get_task_parameter(config, remote_build, "requestMemory")
    nretries = get_task_parameter(config, remote_build, "numberOfRetries")
    _LOG.info("requestMemory: %s", task_rss)
    _LOG.info("Site: %s", task_site)
    # _LOG.info("executable: %s", executable)
    # TODO: fill other parameters based on config.
    build_work = DomaPanDAWork(
        executable=executable,
        task_type="lsst_build",
        primary_input_collection={"scope": "pseudo_dataset", "name": "pseudo_input_collection#1"},
        output_collections=[{"scope": "pseudo_dataset", "name": "pseudo_output_collection#1"}],
        log_collections=[],
        dependency_map=None,
        task_name="build_task",
        task_queue=task_queue,
        encode_command_line=True,
        prodSourceLabel="managed",
        task_log={
            "dataset": "PandaJob_#{pandaid}/",
            "destination": "local",
            "param_type": "log",
            "token": "local",
            "type": "template",
            "value": "log.tgz",
        },
        task_rss=task_rss if task_rss else PANDA_DEFAULT_RSS,
        task_cloud=task_cloud,
        task_site=task_site,
        maxattempt=nretries if nretries > 0 else PANDA_DEFAULT_MAX_ATTEMPTS,
    )

    workflow = IDDS_client_workflow()

    workflow.add_work(build_work)
    workflow.name = config["bps_defined"]["uniqProcName"]
    return workflow