Coverage for python/lsst/ctrl/bps/transform.py: 8%

383 statements  

coverage.py v7.4.0, created at 2024-01-23 11:04 +0000

# This file is part of ctrl_bps.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Driver for the transformation of a QuantumGraph into a generic workflow."""

import copy
import dataclasses
import logging
import math
import os
import re

from lsst.utils.logging import VERBOSE
from lsst.utils.timer import time_this, timeMethod

from . import (
    DEFAULT_MEM_RETRIES,
    BpsConfig,
    GenericWorkflow,
    GenericWorkflowExec,
    GenericWorkflowFile,
    GenericWorkflowJob,
)
from .bps_utils import (
    WhenToSaveQuantumGraphs,
    _create_execution_butler,
    create_job_quantum_graph_filename,
    save_qg_subgraph,
)

# All available job attributes.
_ATTRS_ALL = frozenset([field.name for field in dataclasses.fields(GenericWorkflowJob)])

# Job attributes that need to be set to their maximal value in the cluster.
_ATTRS_MAX = frozenset(
    {
        "memory_multiplier",
        "number_of_retries",
        "request_cpus",
        "request_memory",
        "request_memory_max",
    }
)

# Job attributes that need to be set to the sum of their values in the cluster.
_ATTRS_SUM = frozenset(
    {
        "request_disk",
        "request_walltime",
    }
)

# Job attributes that do not fall into a specific category.
_ATTRS_MISC = frozenset(
    {
        "label",  # taskDef labels aren't the same in a job and may not match the job label
        "cmdvals",
        "profile",
        "attrs",
    }
)

# Attributes that need to be the same for each quantum in the cluster.
_ATTRS_UNIVERSAL = frozenset(_ATTRS_ALL - (_ATTRS_MAX | _ATTRS_MISC | _ATTRS_SUM))

_LOG = logging.getLogger(__name__)
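

# Illustrative sketch (not part of the original module): the four attribute
# groups above are meant to partition _ATTRS_ALL, so that every job attribute
# gets exactly one aggregation rule when quanta are folded into a cluster job.
def _demo_attribute_partition():
    groups = (_ATTRS_MAX, _ATTRS_SUM, _ATTRS_MISC, _ATTRS_UNIVERSAL)
    # The union covers every available job attribute...
    assert frozenset().union(*groups) == _ATTRS_ALL
    # ...and no attribute is assigned more than one rule.
    assert sum(len(group) for group in groups) == len(_ATTRS_ALL)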


@timeMethod(logger=_LOG, logLevel=VERBOSE)
def transform(config, cqgraph, prefix):
    """Transform a ClusteredQuantumGraph to a GenericWorkflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    cqgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A clustered quantum graph to transform into a generic workflow.
    prefix : `str`
        Root path for any output files.

    Returns
    -------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow transformed from the clustered quantum graph.
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to accompany the GenericWorkflow.
    """
    _, when_create = config.search(".executionButler.whenCreate")
    if when_create.upper() == "TRANSFORM":
        _, execution_butler_dir = config.search(".bps_defined.executionButlerDir")
        _LOG.info("Creating execution butler in '%s'", execution_butler_dir)
        with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Creating execution butler completed"):
            _create_execution_butler(config, config["runQgraphFile"], execution_butler_dir, prefix)

    if cqgraph.name is not None:
        name = cqgraph.name
    else:
        _, name = config.search("uniqProcName", opt={"required": True})

    generic_workflow = create_generic_workflow(config, cqgraph, name, prefix)
    generic_workflow_config = create_generic_workflow_config(config, prefix)

    return generic_workflow, generic_workflow_config


def add_workflow_init_nodes(config, qgraph, generic_workflow):
    """Add nodes to workflow graph that perform initialization steps.

    Assumes that all of the initialization should be executed prior to any
    of the current workflow.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        The quantum graph the generic workflow represents.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow to which the initialization steps should be added.
    """
    # Create a workflow graph that will have task and file nodes necessary
    # for initializing the pipeline execution.
    init_workflow = create_init_workflow(config, qgraph, generic_workflow.get_file("runQgraphFile"))
    _LOG.debug("init_workflow nodes = %s", init_workflow.nodes())
    generic_workflow.add_workflow_source(init_workflow)


def create_init_workflow(config, qgraph, qgraph_gwfile):
    """Create workflow for running initialization job(s).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        The quantum graph the generic workflow represents.
    qgraph_gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
        File object for the full run QuantumGraph file.

    Returns
    -------
    init_workflow : `lsst.ctrl.bps.GenericWorkflow`
        GenericWorkflow consisting of job(s) to initialize the workflow.
    """
    _LOG.debug("creating init subgraph")
    _LOG.debug("creating init task input(s)")
    search_opt = {
        "curvals": {"curr_pipetask": "pipetaskInit"},
        "replaceVars": False,
        "expandEnvVars": False,
        "replaceEnvVars": True,
        "required": False,
    }
    found, value = config.search("computeSite", opt=search_opt)
    if found:
        search_opt["curvals"]["curr_site"] = value
    found, value = config.search("computeCloud", opt=search_opt)
    if found:
        search_opt["curvals"]["curr_cloud"] = value

    init_workflow = GenericWorkflow("init")
    init_workflow.add_file(qgraph_gwfile)

    # Create job for executing --init-only.
    gwjob = GenericWorkflowJob("pipetaskInit", label="pipetaskInit")

    job_values = _get_job_values(config, search_opt, "runQuantumCommand")
    job_values["name"] = "pipetaskInit"
    job_values["label"] = "pipetaskInit"

    # Adjust job attribute values if necessary.
    _handle_job_values(job_values, gwjob)

    # Pick a node id for each task (not quantum!) to avoid reading the entire
    # quantum graph during the initialization stage.
    node_ids = []
    for task in qgraph.iterTaskGraph():
        task_def = qgraph.findTaskDefByLabel(task.label)
        node = next(iter(qgraph.getNodesForTask(task_def)))
        node_ids.append(node.nodeId)
    gwjob.cmdvals["qgraphId"] = qgraph.graphID
    gwjob.cmdvals["qgraphNodeId"] = ",".join(sorted([f"{node_id}" for node_id in node_ids]))

    init_workflow.add_job(gwjob)

    # Look up butler values.
    _, when_create = config.search(".executionButler.whenCreate", opt=search_opt)
    _, butler_config = config.search("butlerConfig", opt=search_opt)
    _, execution_butler_dir = config.search(".bps_defined.executionButlerDir", opt=search_opt)
    prefix = config["submitPath"]
    butler_gwfile = _get_butler_gwfile(prefix, when_create, butler_config, execution_butler_dir)

    init_workflow.add_job_inputs(gwjob.name, [qgraph_gwfile, butler_gwfile])
    _enhance_command(config, init_workflow, gwjob, {})

    return init_workflow


def _enhance_command(config, generic_workflow, gwjob, cached_job_values):
    """Enhance command line with env and file placeholders
    and gather command line values.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow that contains the job.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job to which the updated executable, arguments,
        and values should be saved.
    cached_job_values : `dict` [`str`, `dict` [`str`, `Any`]]
        Cached values common across jobs with the same label. Updated if
        values aren't already saved for the given gwjob's label.
    """
    _LOG.debug("gwjob given to _enhance_command: %s", gwjob)

    search_opt = {
        "curvals": {"curr_pipetask": gwjob.label},
        "replaceVars": False,
        "expandEnvVars": False,
        "replaceEnvVars": True,
        "required": False,
    }

    if gwjob.label not in cached_job_values:
        cached_job_values[gwjob.label] = {}
        # Allow whenSaveJobQgraph and useLazyCommands per pipetask label.
        key = "whenSaveJobQgraph"
        _, when_save = config.search(key, opt=search_opt)
        cached_job_values[gwjob.label][key] = WhenToSaveQuantumGraphs[when_save.upper()]

        key = "useLazyCommands"
        search_opt["default"] = True
        _, cached_job_values[gwjob.label][key] = config.search(key, opt=search_opt)
        del search_opt["default"]

    # Change the qgraph variable to match whether using the run or a per-job
    # qgraph. Note: these are lookup keys, not actual physical filenames.
    if cached_job_values[gwjob.label]["whenSaveJobQgraph"] == WhenToSaveQuantumGraphs.NEVER:
        gwjob.arguments = gwjob.arguments.replace("{qgraphFile}", "{runQgraphFile}")
    elif gwjob.name == "pipetaskInit":
        gwjob.arguments = gwjob.arguments.replace("{qgraphFile}", "{runQgraphFile}")
    else:  # Need unique file keys for per-job QuantumGraphs.
        gwjob.arguments = gwjob.arguments.replace("{qgraphFile}", f"{{qgraphFile_{gwjob.name}}}")

    # Replace files with special placeholders.
    for gwfile in generic_workflow.get_job_inputs(gwjob.name):
        gwjob.arguments = gwjob.arguments.replace(f"{{{gwfile.name}}}", f"<FILE:{gwfile.name}>")
    for gwfile in generic_workflow.get_job_outputs(gwjob.name):
        gwjob.arguments = gwjob.arguments.replace(f"{{{gwfile.name}}}", f"<FILE:{gwfile.name}>")

    # Save a dict of other values needed to complete the command line.
    # (Be careful not to replace env variables as they may be different
    # in the compute job.)
    search_opt["replaceVars"] = True

    for key in re.findall(r"{([^}]+)}", gwjob.arguments):
        if key not in gwjob.cmdvals:
            if key not in cached_job_values[gwjob.label]:
                _, cached_job_values[gwjob.label][key] = config.search(key, opt=search_opt)
            gwjob.cmdvals[key] = cached_job_values[gwjob.label][key]

    # Backwards compatibility.
    if not cached_job_values[gwjob.label]["useLazyCommands"]:
        if "bpsUseShared" not in cached_job_values[gwjob.label]:
            key = "bpsUseShared"
            search_opt["default"] = True
            _, cached_job_values[gwjob.label][key] = config.search(key, opt=search_opt)
            del search_opt["default"]

        gwjob.arguments = _fill_arguments(
            cached_job_values[gwjob.label]["bpsUseShared"], generic_workflow, gwjob.arguments, gwjob.cmdvals
        )


def _fill_arguments(use_shared, generic_workflow, arguments, cmdvals):
    """Replace placeholders in command line string in job.

    Parameters
    ----------
    use_shared : `bool`
        Whether using a shared filesystem.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow containing the job.
    arguments : `str`
        String containing placeholders.
    cmdvals : `dict` [`str`, `Any`]
        Any command line values that can be used to replace placeholders.

    Returns
    -------
    arguments : `str`
        Command line with FILE and ENV placeholders replaced.
    """
    # Replace file placeholders.
    for file_key in re.findall(r"<FILE:([^>]+)>", arguments):
        gwfile = generic_workflow.get_file(file_key)
        if not gwfile.wms_transfer:
            # Must assume full URI if in command line and told WMS is not
            # responsible for transferring the file.
            uri = gwfile.src_uri
        elif use_shared:
            if gwfile.job_shared:
                # Have shared filesystems and jobs can share the file.
                uri = gwfile.src_uri
            else:
                # Taking advantage of inside knowledge. Not future-proof.
                # Temporary fix until there is a job wrapper that pulls
                # files within the job.
                if gwfile.name == "butlerConfig" and os.path.splitext(gwfile.src_uri)[1] != ".yaml":
                    uri = "butler.yaml"
                else:
                    uri = os.path.basename(gwfile.src_uri)
        else:  # Using push transfer.
            uri = os.path.basename(gwfile.src_uri)

        arguments = arguments.replace(f"<FILE:{file_key}>", uri)

    # Replace env placeholders with submit-side values.
    arguments = re.sub(r"<ENV:([^>]+)>", r"$\1", arguments)
    arguments = os.path.expandvars(arguments)

    # Replace remaining vars.
    arguments = arguments.format(**cmdvals)

    return arguments
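

# Illustrative sketch (not part of the original module): how the FILE/ENV
# placeholder grammar handled by _fill_arguments behaves. The argument string
# and the substituted URIs below are invented for demonstration only.
def _demo_fill_placeholders():
    arguments = "pipetask run -b <FILE:butlerConfig> -g <FILE:qgraphFile> {extraArgs}"
    # FILE placeholders become URIs chosen per gwfile flags (full src_uri,
    # basename, or the special butler.yaml case above).
    arguments = arguments.replace("<FILE:butlerConfig>", "butler.yaml")
    arguments = arguments.replace("<FILE:qgraphFile>", "run.qgraph")
    # ENV placeholders become shell-style variables and are then expanded
    # with submit-side values.
    arguments = re.sub(r"<ENV:([^>]+)>", r"$\1", arguments)
    arguments = os.path.expandvars(arguments)
    # Remaining {key} placeholders are filled from cmdvals.
    return arguments.format(extraArgs="--register-dataset-types")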


def _get_butler_gwfile(prefix, when_create, butler_config, execution_butler_dir):
    """Get butler location to be used by job.

    Parameters
    ----------
    prefix : `str`
        Root path for any output files.
    when_create : `str`
        When to create the execution butler, used to determine whether the
        job is using the execution butler or not.
    butler_config : `str`
        Location of the central butler repository's config file.
    execution_butler_dir : `str`
        Location of the execution butler repository.

    Returns
    -------
    gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
        Representation of the butler location.
    """
    if when_create.upper() == "NEVER":
        wms_transfer = False
        job_access_remote = True
        job_shared = True
    else:
        butler_config = execution_butler_dir
        if not butler_config.startswith("/"):
            butler_config = f"{prefix}/{butler_config}"
        wms_transfer = True
        job_access_remote = False
        job_shared = False

    gwfile = GenericWorkflowFile(
        "butlerConfig",
        src_uri=butler_config,
        wms_transfer=wms_transfer,
        job_access_remote=job_access_remote,
        job_shared=job_shared,
    )

    return gwfile


def _get_qgraph_gwfile(config, save_qgraph_per_job, gwjob, run_qgraph_file, prefix):
    """Get qgraph location to be used by job.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    save_qgraph_per_job : `lsst.ctrl.bps.bps_utils.WhenToSaveQuantumGraphs`
        At what submission stage to save per-job qgraph files (or NEVER).
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Job for which the QuantumGraph file is being determined.
    run_qgraph_file : `lsst.ctrl.bps.GenericWorkflowFile`
        File representation of the full run QuantumGraph.
    prefix : `str`
        Path prefix for any files written.

    Returns
    -------
    gwfile : `lsst.ctrl.bps.GenericWorkflowFile`
        Representation of the QuantumGraph location (may not include
        filename).
    """
    qgraph_gwfile = None
    if save_qgraph_per_job != WhenToSaveQuantumGraphs.NEVER:
        qgraph_gwfile = GenericWorkflowFile(
            f"qgraphFile_{gwjob.name}",
            src_uri=create_job_quantum_graph_filename(config, gwjob, prefix),
            wms_transfer=True,
            job_access_remote=True,
            job_shared=True,
        )
    else:
        qgraph_gwfile = run_qgraph_file

    return qgraph_gwfile


def _get_job_values(config, search_opt, cmd_line_key):
    """Gather generic workflow job values from the bps config.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    search_opt : `dict` [`str`, `Any`]
        Search options to be used when searching the config.
    cmd_line_key : `str` or `None`
        Which command line key to search for (e.g., "runQuantumCommand").

    Returns
    -------
    job_values : `dict` [`str`, `Any`]
        A mapping between job attributes and their values.
    """
    _LOG.debug("cmd_line_key=%s, search_opt=%s", cmd_line_key, search_opt)

    # Create a dummy job to easily access the default values.
    default_gwjob = GenericWorkflowJob("default_job")

    job_values = {}
    for attr in _ATTRS_ALL:
        # Variable names in yaml are camel case instead of snake case.
        yaml_name = re.sub(r"_(\S)", lambda match: match.group(1).upper(), attr)
        found, value = config.search(yaml_name, opt=search_opt)
        if found:
            job_values[attr] = value
        else:
            job_values[attr] = getattr(default_gwjob, attr)

    # If automatic memory scaling is enabled (i.e., the memory multiplier
    # is set and it is a positive number greater than 1.0), adjust the number
    # of retries when necessary. If the memory multiplier is invalid, disable
    # automatic memory scaling.
    if job_values["memory_multiplier"] is not None:
        if math.ceil(float(job_values["memory_multiplier"])) > 1:
            if job_values["number_of_retries"] is None:
                job_values["number_of_retries"] = DEFAULT_MEM_RETRIES
        else:
            job_values["memory_multiplier"] = None

    if cmd_line_key:
        found, cmdline = config.search(cmd_line_key, opt=search_opt)
        # Make sure cmdline isn't None as that could be sent in as a
        # default value in search_opt.
        if found and cmdline:
            cmd, args = cmdline.split(" ", 1)
            job_values["executable"] = GenericWorkflowExec(os.path.basename(cmd), cmd, False)
            if args:
                job_values["arguments"] = args

    return job_values
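

# Illustrative sketch (not part of the original module): the snake_case to
# camelCase conversion used by _get_job_values to map job attribute names to
# their config keys.
def _demo_attr_to_yaml_name():
    for attr in ("request_memory", "memory_multiplier", "number_of_retries"):
        yaml_name = re.sub(r"_(\S)", lambda match: match.group(1).upper(), attr)
        print(attr, "->", yaml_name)  # e.g., request_memory -> requestMemory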


def _handle_job_values(quantum_job_values, gwjob, attributes=_ATTRS_ALL):
    """Set the job attributes in the cluster to their correct values.

    Parameters
    ----------
    quantum_job_values : `dict` [`str`, `Any`]
        Job values for running a single Quantum.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job in which to store the values.
    attributes : `Iterable` [`str`], optional
        Job attributes to be set in the job following different rules.
        The default value is _ATTRS_ALL.
    """
    _LOG.debug("Call to _handle_job_values")
    _handle_job_values_universal(quantum_job_values, gwjob, attributes)
    _handle_job_values_max(quantum_job_values, gwjob, attributes)
    _handle_job_values_sum(quantum_job_values, gwjob, attributes)


def _handle_job_values_universal(quantum_job_values, gwjob, attributes=_ATTRS_UNIVERSAL):
    """Handle job attributes that must have the same value for every quantum
    in the cluster.

    Parameters
    ----------
    quantum_job_values : `dict` [`str`, `Any`]
        Job values for running a single Quantum.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job in which to store the universal values.
    attributes : `Iterable` [`str`], optional
        Job attributes to be set in the job following different rules.
        The default value is _ATTRS_UNIVERSAL.
    """
    for attr in _ATTRS_UNIVERSAL & set(attributes):
        _LOG.debug(
            "Handling job %s (job=%s, quantum=%s)",
            attr,
            getattr(gwjob, attr),
            quantum_job_values.get(attr, "MISSING"),
        )
        current_value = getattr(gwjob, attr)
        try:
            quantum_value = quantum_job_values[attr]
        except KeyError:
            continue
        else:
            if not current_value:
                setattr(gwjob, attr, quantum_value)
            elif current_value != quantum_value:
                _LOG.error(
                    "Inconsistent value for %s in cluster %s (quantum %s)\n"
                    "Current cluster value: %s\n"
                    "Quantum value: %s",
                    attr,
                    gwjob.name,
                    quantum_job_values.get("qgraphNodeId", "MISSING"),
                    current_value,
                    quantum_value,
                )
                raise RuntimeError(f"Inconsistent value for {attr} in cluster {gwjob.name}.")


def _handle_job_values_max(quantum_job_values, gwjob, attributes=_ATTRS_MAX):
    """Handle job attributes that should be set to their maximum value in
    the cluster.

    Parameters
    ----------
    quantum_job_values : `dict` [`str`, `Any`]
        Job values for running a single Quantum.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job in which to store the aggregate values.
    attributes : `Iterable` [`str`], optional
        Job attributes to be set in the job following different rules.
        The default value is _ATTRS_MAX.
    """
    for attr in _ATTRS_MAX & set(attributes):
        current_value = getattr(gwjob, attr)
        try:
            quantum_value = quantum_job_values[attr]
        except KeyError:
            continue
        else:
            needs_update = False
            if current_value is None:
                if quantum_value is not None:
                    needs_update = True
            else:
                if quantum_value is not None and current_value < quantum_value:
                    needs_update = True
            if needs_update:
                setattr(gwjob, attr, quantum_value)

                # When updating memory requirements for a job, check if
                # memory autoscaling is enabled. If it is, always use the
                # memory multiplier and the number of retries which come
                # with the quantum.
                #
                # Note that as a result, the quantum with the biggest memory
                # requirements will determine whether the memory autoscaling
                # will be enabled (or disabled) depending on the value of its
                # memory multiplier.
                if attr == "request_memory":
                    gwjob.memory_multiplier = quantum_job_values["memory_multiplier"]
                    if gwjob.memory_multiplier is not None:
                        gwjob.number_of_retries = quantum_job_values["number_of_retries"]


def _handle_job_values_sum(quantum_job_values, gwjob, attributes=_ATTRS_SUM):
    """Handle job attributes that are the sum of their values in the cluster.

    Parameters
    ----------
    quantum_job_values : `dict` [`str`, `Any`]
        Job values for running a single Quantum.
    gwjob : `lsst.ctrl.bps.GenericWorkflowJob`
        Generic workflow job in which to store the aggregate values.
    attributes : `Iterable` [`str`], optional
        Job attributes to be set in the job following different rules.
        The default value is _ATTRS_SUM.
    """
    for attr in _ATTRS_SUM & set(attributes):
        current_value = getattr(gwjob, attr)
        if not current_value:
            setattr(gwjob, attr, quantum_job_values[attr])
        else:
            setattr(gwjob, attr, current_value + quantum_job_values[attr])
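

# Illustrative sketch (not part of the original module): the aggregation
# semantics applied per attribute category when quanta are folded into one
# cluster job. Plain dicts and made-up values stand in for
# GenericWorkflowJob and real resource requests here.
def _demo_cluster_aggregation():
    quanta = [
        {"request_memory": 2048, "request_walltime": 600, "compute_site": "siteA"},
        {"request_memory": 4096, "request_walltime": 300, "compute_site": "siteA"},
    ]
    job = {}
    for qjob in quanta:
        # _ATTRS_MAX-style: keep the per-cluster maximum.
        job["request_memory"] = max(job.get("request_memory", 0), qjob["request_memory"])
        # _ATTRS_SUM-style: accumulate across quanta.
        job["request_walltime"] = job.get("request_walltime", 0) + qjob["request_walltime"]
        # _ATTRS_UNIVERSAL-style: values must agree for every quantum.
        assert job.setdefault("compute_site", qjob["compute_site"]) == qjob["compute_site"]
    return job  # {'request_memory': 4096, 'request_walltime': 900, 'compute_site': 'siteA'}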


def create_generic_workflow(config, cqgraph, name, prefix):
    """Create a generic workflow from a ClusteredQuantumGraph such that it
    has information needed for WMS (e.g., command lines).

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    cqgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        ClusteredQuantumGraph for running a specific pipeline on a specific
        payload.
    name : `str`
        Name for the workflow (typically unique).
    prefix : `str`
        Root path for any output files.

    Returns
    -------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow for the given ClusteredQuantumGraph + config.
    """
    # Determine whether to save per-job QuantumGraph files in the loop.
    _, when_save = config.search("whenSaveJobQgraph", {"default": WhenToSaveQuantumGraphs.TRANSFORM.name})
    save_qgraph_per_job = WhenToSaveQuantumGraphs[when_save.upper()]

    search_opt = {"replaceVars": False, "expandEnvVars": False, "replaceEnvVars": True, "required": False}

    # Look up butler values once.
    _, when_create = config.search(".executionButler.whenCreate", opt=search_opt)
    _, butler_config = config.search("butlerConfig", opt=search_opt)
    _, execution_butler_dir = config.search(".bps_defined.executionButlerDir", opt=search_opt)

    generic_workflow = GenericWorkflow(name)

    # Save the full run QuantumGraph for use by jobs.
    generic_workflow.add_file(
        GenericWorkflowFile(
            "runQgraphFile",
            src_uri=config["runQgraphFile"],
            wms_transfer=True,
            job_access_remote=True,
            job_shared=True,
        )
    )

    # Cache pipetask-specific or more generic job values to minimize the
    # number of config searches.
    cached_job_values = {}
    cached_pipetask_values = {}

    for cluster in cqgraph.clusters():
        _LOG.debug("Loop over clusters: %s, %s", cluster, type(cluster))
        _LOG.debug(
            "cqgraph: name=%s, len=%s, label=%s, ids=%s",
            cluster.name,
            len(cluster.qgraph_node_ids),
            cluster.label,
            cluster.qgraph_node_ids,
        )

        gwjob = GenericWorkflowJob(cluster.name, label=cluster.label)

        # First get job values from the cluster or the cluster config.
        search_opt["curvals"] = {"curr_cluster": cluster.label}
        found, value = config.search("computeSite", opt=search_opt)
        if found:
            search_opt["curvals"]["curr_site"] = value
        found, value = config.search("computeCloud", opt=search_opt)
        if found:
            search_opt["curvals"]["curr_cloud"] = value

        # If some config values are set for this cluster.
        if cluster.label not in cached_job_values:
            _LOG.debug("config['cluster'][%s] = %s", cluster.label, config["cluster"][cluster.label])
            cached_job_values[cluster.label] = {}

            # Allow whenSaveJobQgraph and useLazyCommands per cluster label.
            key = "whenSaveJobQgraph"
            _, when_save = config.search(key, opt=search_opt)
            cached_job_values[cluster.label][key] = WhenToSaveQuantumGraphs[when_save.upper()]

            key = "useLazyCommands"
            search_opt["default"] = True
            _, cached_job_values[cluster.label][key] = config.search(key, opt=search_opt)
            del search_opt["default"]

            if cluster.label in config["cluster"]:
                # Don't want to get global defaults here so only look in
                # the cluster section.
                cached_job_values[cluster.label].update(
                    _get_job_values(config["cluster"][cluster.label], search_opt, "runQuantumCommand")
                )
        cluster_job_values = copy.copy(cached_job_values[cluster.label])

        cluster_job_values["name"] = cluster.name
        cluster_job_values["label"] = cluster.label
        cluster_job_values["quanta_counts"] = cluster.quanta_counts
        cluster_job_values["tags"] = cluster.tags
        _LOG.debug("cluster_job_values = %s", cluster_job_values)
        _handle_job_values(cluster_job_values, gwjob, cluster_job_values.keys())

        # For the purposes of deciding whether to continue searching for a
        # value, an attribute counts as unset if its value evaluates to
        # False.
        unset_attributes = {attr for attr in _ATTRS_ALL if not getattr(gwjob, attr)}

        _LOG.debug("unset_attributes=%s", unset_attributes)
        _LOG.debug("set=%s", _ATTRS_ALL - unset_attributes)

        # For job info not defined at the cluster level, attempt to get job
        # info either common or aggregate for all Quanta in the cluster.
        for node_id in iter(cluster.qgraph_node_ids):
            _LOG.debug("node_id=%s", node_id)
            qnode = cqgraph.get_quantum_node(node_id)

            if qnode.taskDef.label not in cached_pipetask_values:
                search_opt["curvals"]["curr_pipetask"] = qnode.taskDef.label
                cached_pipetask_values[qnode.taskDef.label] = _get_job_values(
                    config, search_opt, "runQuantumCommand"
                )

            _handle_job_values(cached_pipetask_values[qnode.taskDef.label], gwjob, unset_attributes)

        # Update job with workflow attribute and profile values.
        qgraph_gwfile = _get_qgraph_gwfile(
            config, save_qgraph_per_job, gwjob, generic_workflow.get_file("runQgraphFile"), prefix
        )
        butler_gwfile = _get_butler_gwfile(prefix, when_create, butler_config, execution_butler_dir)

        generic_workflow.add_job(gwjob)
        generic_workflow.add_job_inputs(gwjob.name, [qgraph_gwfile, butler_gwfile])

        gwjob.cmdvals["qgraphId"] = cqgraph.qgraph.graphID
        gwjob.cmdvals["qgraphNodeId"] = ",".join(
            sorted([f"{node_id}" for node_id in cluster.qgraph_node_ids])
        )
        _enhance_command(config, generic_workflow, gwjob, cached_job_values)

        # If writing per-job QuantumGraph files during the TRANSFORM stage,
        # write them now while in memory.
        if save_qgraph_per_job == WhenToSaveQuantumGraphs.TRANSFORM:
            save_qg_subgraph(cqgraph.qgraph, qgraph_gwfile.src_uri, cluster.qgraph_node_ids)

    # Create job dependencies.
    for parent in cqgraph.clusters():
        for child in cqgraph.successors(parent):
            generic_workflow.add_job_relationships(parent.name, child.name)

    # Add initial workflow.
    if config.get("runInit", "{default: False}"):
        add_workflow_init_nodes(config, cqgraph.qgraph, generic_workflow)

    generic_workflow.run_attrs.update(
        {
            "bps_isjob": "True",
            "bps_project": config["project"],
            "bps_campaign": config["campaign"],
            "bps_run": generic_workflow.name,
            "bps_operator": config["operator"],
            "bps_payload": config["payloadName"],
            "bps_runsite": config["computeSite"],
        }
    )

    # Add final job.
    add_final_job(config, generic_workflow, prefix)

    return generic_workflow
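

# Illustrative sketch (not part of the original module): the two-level lookup
# in the loop above, where cluster-level settings win and only attributes
# still unset (falsy) fall back to per-pipetask values. Plain dicts and
# made-up numbers stand in for the config sections.
def _demo_two_level_lookup():
    cluster_values = {"request_memory": 8192, "request_cpus": 0}
    pipetask_values = {"request_memory": 2048, "request_cpus": 4}
    job = dict(cluster_values)
    unset = {attr for attr, value in job.items() if not value}
    for attr in unset:
        job[attr] = pipetask_values[attr]
    return job  # {'request_memory': 8192, 'request_cpus': 4}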


def create_generic_workflow_config(config, prefix):
    """Create generic workflow configuration.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    prefix : `str`
        Root path for any output files.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration accompanying the GenericWorkflow.
    """
    generic_workflow_config = BpsConfig(config)
    generic_workflow_config["workflowName"] = config["uniqProcName"]
    generic_workflow_config["workflowPath"] = prefix
    return generic_workflow_config


def add_final_job(config, generic_workflow, prefix):
    """Add final workflow job depending upon configuration.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow to which attributes should be added.
    prefix : `str`
        Directory in which to output the final script.

    Notes
    -----
    This dispatch function was introduced to preserve the existing code
    responsible for dealing with the execution Butler (EB). Once there is
    no need to support the EB any longer, it can be replaced by the function
    responsible for handling the final job.
    """
    # The order of the entries determines the priorities regarding which
    # method will be used when adding the final job if the configuration
    # provides conflicting specifications.
    dispatcher = {
        ".finalJob.whenRun": _add_final_job,
        ".executionButler.whenCreate": _add_merge_job,
    }
    for name, func in dispatcher.items():
        if name in config and config[name] != "NEVER":
            break
    else:
        raise RuntimeError("Final job specification not found")
    func(config, generic_workflow, prefix)


def _add_final_job(config, generic_workflow, prefix):
    """Add the final job.

    Depending on configuration, the final job will be added either as a
    special job which will always run regardless of the exit status of the
    workflow, or as a regular sink node which will only run if the workflow
    execution finished with no errors.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow to which attributes should be added.
    prefix : `str`
        Directory in which to output the final script.
    """
    _, when_run = config.search(".finalJob.whenRun")
    if when_run.upper() != "NEVER":
        create_final_job = _make_final_job_creator("finalJob", _create_final_command)
        gwjob = create_final_job(config, generic_workflow, prefix)
        if when_run.upper() == "ALWAYS":
            generic_workflow.add_final(gwjob)
        elif when_run.upper() == "SUCCESS":
            add_final_job_as_sink(generic_workflow, gwjob)
        else:
            raise ValueError(f"Invalid value for finalJob.whenRun: {when_run}")


def _add_merge_job(config, generic_workflow, prefix):
    """Add the job responsible for merging back the execution Butler.

    Depending on configuration, the merge job will be added either as a
    special job which will always run regardless of the exit status of the
    workflow, or as a regular sink node which will only run if the workflow
    execution finished with no errors.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow to which attributes should be added.
    prefix : `str`
        Directory in which to output the final script.
    """
    _, when_create = config.search(".executionButler.whenCreate")
    _, when_merge = config.search(".executionButler.whenMerge")
    if when_create.upper() != "NEVER" and when_merge.upper() != "NEVER":
        create_final_job = _make_final_job_creator("executionButler", _create_merge_command)
        gwjob = create_final_job(config, generic_workflow, prefix)
        if when_merge.upper() == "ALWAYS":
            generic_workflow.add_final(gwjob)
        elif when_merge.upper() == "SUCCESS":
            add_final_job_as_sink(generic_workflow, gwjob)
        else:
            raise ValueError(f"Invalid value for executionButler.whenMerge: {when_merge}")


def _make_final_job_creator(job_name, create_cmd):
    """Construct a function that creates the final job.

    Parameters
    ----------
    job_name : `str`
        Name of the job. It will also be used as the job label.
    create_cmd : callable
        Function to use when creating the script for the final job. It takes
        two positional arguments:

        - `config`: run configuration (`BpsConfig`).
        - `prefix`: directory in which to output the final script (`str`).

    Returns
    -------
    create_gwjob : callable
        Function to use to create a generic workflow job. The function takes
        three positional arguments:

        - `config`: run configuration (`BpsConfig`).
        - `generic_workflow`: generic workflow to which the final job should
          be added.
        - `prefix`: directory in which to output the final script (`str`).

    Notes
    -----
    Implemented as a closure in order to reduce code duplication and provide
    the extra flexibility needed to support the creation of the final node
    for both the execution and the quantum-backed Butler with minimal impact
    on the existing code base. Once all supported plugins are able to use
    the quantum-backed Butler, the inner function can be merged with the
    remaining function responsible for adding the final node and the closure
    can be removed.
    """

    def create_final_job(config, generic_workflow, prefix):
        gwjob = GenericWorkflowJob(job_name, label=job_name)

        search_opt = {"searchobj": config[job_name], "curvals": {}, "default": None}
        found, value = config.search("computeSite", opt=search_opt)
        if found:
            search_opt["curvals"]["curr_site"] = value
        found, value = config.search("computeCloud", opt=search_opt)
        if found:
            search_opt["curvals"]["curr_cloud"] = value

        # Set job attributes based on the values found in the config,
        # excluding the ones in the _ATTRS_MISC group. The attributes in
        # this group are somewhat "special":
        # * the HTCondor plugin, which uses 'attrs' and 'profile', has its
        #   own mechanism for setting them,
        # * 'cmdvals' is set internally, not via config.
        job_values = _get_job_values(config, search_opt, None)
        for attr in _ATTRS_ALL - _ATTRS_MISC:
            if not getattr(gwjob, attr) and job_values.get(attr, None):
                setattr(gwjob, attr, job_values[attr])

        # Create script and add command line to job.
        gwjob.executable, gwjob.arguments = create_cmd(config, prefix)

        # Determine inputs from the command line.
        for file_key in re.findall(r"<FILE:([^>]+)>", gwjob.arguments):
            gwfile = generic_workflow.get_file(file_key)
            generic_workflow.add_job_inputs(gwjob.name, gwfile)

        _enhance_command(config, generic_workflow, gwjob, {})
        return gwjob

    return create_final_job


def _create_final_command(config, prefix):
    """Create the command and shell script for the final job.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    prefix : `str`
        Directory in which to output the final script.

    Returns
    -------
    executable : `lsst.ctrl.bps.GenericWorkflowExec`
        Executable object for the final script.
    arguments : `str`
        Command line needed to call the final script.
    """
    search_opt = {
        "replaceVars": False,
        "replaceEnvVars": False,
        "expandEnvVars": False,
        "searchobj": config["finalJob"],
    }

    script_file = os.path.join(prefix, "final_job.bash")
    with open(script_file, "w", encoding="utf8") as fh:
        print("#!/bin/bash\n", file=fh)
        print("set -e", file=fh)
        print("set -x", file=fh)

        print("qgraphFile=$1", file=fh)
        print("butlerConfig=$2", file=fh)

        i = 1
        found, command = config.search(f"command{i}", opt=search_opt)
        while found:
            # Temporarily replace any env vars so the formatter doesn't try
            # to replace them.
            command = re.sub(r"\${([^}]+)}", r"<BPSTMP:\1>", command)

            # qgraphFile and butlerConfig will be arguments to the script,
            # set as env vars.
            command = command.replace("{qgraphFile}", "<BPSTMP:qgraphFile>")
            command = command.replace("{butlerConfig}", "<BPSTMP:butlerConfig>")

            # Replace all other vars in the command string.
            search_opt["replaceVars"] = True
            command = config.formatter.format(command, config, search_opt)
            search_opt["replaceVars"] = False

            # Replace any temporary env placeholders.
            command = re.sub(r"<BPSTMP:([^>]+)>", r"${\1}", command)

            print(command, file=fh)
            i += 1
            found, command = config.search(f"command{i}", opt=search_opt)
    os.chmod(script_file, 0o755)
    executable = GenericWorkflowExec(os.path.basename(script_file), script_file, True)

    _, orig_butler = config.search("butlerConfig")
    return executable, f"<FILE:runQgraphFile> {orig_butler}"
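

# Illustrative sketch (not part of the original module): the <BPSTMP:...>
# round trip used above to shield ${VAR} references from the config formatter
# before restoring them in the generated script. The command string below is
# invented for demonstration only.
def _demo_bpstmp_roundtrip():
    command = "butler transfer-datasets ${EXEC_BUTLER} {butlerConfig}"
    # Shield shell-style env vars from the formatter...
    shielded = re.sub(r"\${([^}]+)}", r"<BPSTMP:\1>", command)
    # ...mark the script-argument placeholders the same way...
    shielded = shielded.replace("{butlerConfig}", "<BPSTMP:butlerConfig>")
    # (config.formatter.format() would replace any remaining {vars} here)
    # ...then restore everything to ${VAR} form for the bash script.
    return re.sub(r"<BPSTMP:([^>]+)>", r"${\1}", shielded)
    # -> "butler transfer-datasets ${EXEC_BUTLER} ${butlerConfig}"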


def _create_merge_command(config, prefix):
    """Create the command and shell script for merging the execution Butler.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration.
    prefix : `str`
        Directory in which to output the final script.

    Returns
    -------
    executable : `lsst.ctrl.bps.GenericWorkflowExec`
        Executable object for the final script.
    arguments : `str`
        Command line needed to call the final script.
    """
    search_opt = {
        "replaceVars": False,
        "replaceEnvVars": False,
        "expandEnvVars": False,
        "searchobj": config["executionButler"],
    }

    script_file = os.path.join(prefix, "final_job.bash")
    with open(script_file, "w", encoding="utf8") as fh:
        print("#!/bin/bash\n", file=fh)
        print("set -e", file=fh)
        print("set -x", file=fh)

        print("butlerConfig=$1", file=fh)
        print("executionButlerDir=$2", file=fh)

        i = 1
        found, command = config.search(f"command{i}", opt=search_opt)
        while found:
            # Temporarily replace any env vars so the formatter doesn't try
            # to replace them.
            command = re.sub(r"\${([^}]+)}", r"<BPSTMP:\1>", command)

            # executionButlerDir and butlerConfig will be arguments to the
            # script, set as env vars.
            command = command.replace("{executionButlerDir}", "<BPSTMP:executionButlerDir>")
            command = command.replace("{butlerConfig}", "<BPSTMP:butlerConfig>")

            # Replace all other vars in the command string.
            search_opt["replaceVars"] = True
            command = config.formatter.format(command, config, search_opt)
            search_opt["replaceVars"] = False

            # Replace any temporary env placeholders.
            command = re.sub(r"<BPSTMP:([^>]+)>", r"${\1}", command)

            print(command, file=fh)
            i += 1
            found, command = config.search(f"command{i}", opt=search_opt)
    os.chmod(script_file, 0o755)
    executable = GenericWorkflowExec(os.path.basename(script_file), script_file, True)

    _, orig_butler = config.search("butlerConfig")
    # The execution butler was saved as butlerConfig in the workflow.
    return executable, f"{orig_butler} <FILE:butlerConfig>"


def add_final_job_as_sink(generic_workflow, final_job):
    """Add the final job as the single sink for the workflow.

    Parameters
    ----------
    generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
        Generic workflow to which attributes should be added.
    final_job : `lsst.ctrl.bps.GenericWorkflowJob`
        Job to add as the new sink node, depending upon all previous sink
        nodes.
    """
    # Find sink nodes of the generic workflow graph.
    gw_sinks = [n for n in generic_workflow if generic_workflow.out_degree(n) == 0]
    _LOG.debug("gw_sinks = %s", gw_sinks)

    generic_workflow.add_job(final_job)
    generic_workflow.add_job_relationships(gw_sinks, final_job.name)
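

# Illustrative sketch (not part of the original module): sink detection on a
# plain networkx.DiGraph, which (as assumed here) GenericWorkflow builds on;
# the node names are invented for demonstration only.
def _demo_find_sinks():
    import networkx

    graph = networkx.DiGraph([("pipetaskInit", "clusterA"), ("pipetaskInit", "clusterB")])
    # Nodes with no outgoing edges are the current workflow sinks.
    return [n for n in graph if graph.out_degree(n) == 0]  # ['clusterA', 'clusterB']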