Coverage for python/lsst/ctrl/bps/panda/idds_tasks.py: 26%

239 statements  

coverage.py v6.5.0, created at 2023-01-26 02:16 -0800

# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
import os.path
from dataclasses import dataclass

from lsst.ctrl.bps import GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder

_LOG = logging.getLogger(__name__)


@dataclass
class FileDescriptor:
    """Holds parameters needed to define a file used by a job or task"""

    name: str = None
    """Name of the file"""
    distribution_url: str = None
    """The location (URL) where this file is to be distributed to the edge node"""
    submission_url: str = None
    """Path to the file on the submission node"""
    direct_IO: bool = False
    """Is the file to be used remotely"""
    delivered: bool = False
    """Has this file been delivered to the distribution endpoint"""


@dataclass
class RubinTask:
    """Holds parameters needed to define a PanDA task"""

    name: str = None
    """Name of the task"""
    step: str = None
    """Processing step"""
    queue: str = None
    """Computing queue where the task is to be submitted"""
    executable: str = None
    """The task command line to be executed"""
    max_walltime: int = None
    """Maximum allowed walltime in seconds"""
    max_attempt: int = None
    """Maximum number of job attempts in a task"""
    max_rss: int = None
    """Maximum amount of RAM to be used by a job"""
    cloud: str = None
    """Computing cloud in the CRIC registry where the task should
    be submitted"""
    site: str = None
    """Computing site in the CRIC registry where the task should
    be submitted"""
    core_count: int = 1
    """Number of CPU cores to be used by a job"""
    working_group: str = None
    """Group for accounting"""
    priority: int = 0
    """Task priority"""
    processing_type: str = None
    """Task processing type, such as simulation or reconstruction"""
    task_type: str = None
    """The type of the task, such as production or analysis"""
    prod_source_label: str = "managed"
    """Label distinguishing production jobs from test jobs. Its value
    can be 'managed' or 'test'"""
    vo: str = "Rubin"
    """Virtual organization name"""
    jobs_pseudo_inputs: list = None
    """Pseudo inputs used by the task to define its jobs"""
    files_used_by_task: list = None
    """List of physical files necessary for running a task"""
    dependencies: list = None
    """List of upstream tasks and their pseudo input parameters
    needed for running jobs in this task"""
    is_final: bool = False
    """Is this a finalization task"""
    is_dag_end: bool = False
    """Is this task at the end of the DAG"""

class IDDSWorkflowGenerator:
    """Generate an iDDS workflow to be submitted into PanDA.

    The workflow includes the definition of each task and the
    definition of dependencies for each task input.

    Parameters
    ----------
    bps_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow constructed by the BPS system.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime information,
        sufficiently defined in the YAML file supplied to the ``submit``
        command.
    """

    def __init__(self, bps_workflow, config):
        self.bps_workflow = bps_workflow
        self.bps_config = config
        self.jobs_steps = {}
        self.tasks_steps = {}
        self.tasks_cmd_lines = {}
        self.dag_end_tasks = set()
        self.number_of_retries = {}
        _, self.max_walltime = config.search("maxWalltime", opt={"default": 90000})
        _, self.max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": 30000})
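        # "maxWalltime" and "maxJobsPerTask" come from the BPS configuration;
        # the values above are the defaults applied when the submit YAML does
        # not set them. A submit file could override them with, e.g.
        # (hypothetical values):
        #
        #   maxWalltime: 172800
        #   maxJobsPerTask: 10000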

    def define_task_name(self, step):
        """Return task name as a combination of the workflow name (unique
        across workflows) and processing step name.

        Parameters
        ----------
        step : `str`
            Processing step name

        Returns
        -------
        task_name : `str`
            Computed task name
        """
        return self.bps_config["workflowName"] + "_" + step
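
    # For example (hypothetical names): with workflowName set to
    # "u_jdoe_pipelines_check_w40" and step "isr", define_task_name()
    # returns "u_jdoe_pipelines_check_w40_isr".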

    def fill_input_files(self, task_name):
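        """Collect the physical input files used by the jobs of a task.

        Only job inputs marked for transfer are considered; for each such
        file a `FileDescriptor` is filled with its location on the
        submission node and its distribution URL under the configured
        ``fileDistributionEndPoint``.

        Parameters
        ----------
        task_name : `str`
            Name of the task.

        Returns
        -------
        files : `list` [`FileDescriptor`]
            Files needed by the jobs of the task.
        """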

        files = []
        jobs = [
            job_name
            for job_name in self.bps_workflow
            if self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name]
        ]
        for job in jobs:
            for gwfile in self.bps_workflow.get_job_inputs(job, transfer_only=True):
                file = FileDescriptor()
                file.name = gwfile.name
                file.submission_url = gwfile.src_uri
                file.distribution_url = os.path.join(
                    self.bps_config["fileDistributionEndPoint"], os.path.basename(gwfile.src_uri)
                )
                file.direct_IO = gwfile.job_access_remote
                files.append(file)
        return files
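
    # A hypothetical illustration of fill_input_files(): for a transferable
    # job input at "/home/jdoe/submit/pipeline.qgraph" and a
    # fileDistributionEndPoint of "https://storage.example.org/dist", the
    # resulting FileDescriptor gets
    #   submission_url   = "/home/jdoe/submit/pipeline.qgraph"
    #   distribution_url = "https://storage.example.org/dist/pipeline.qgraph"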

    def define_tasks(self):
        """Provide task definitions sufficient for PanDA submission.

        Returns
        -------
        tasks : `list` [`RubinTask`]
            Tasks filled with parameters provided in the workflow
            configuration and the generated pipeline.
        """
        tasks = []
        raw_dependency_map = self.create_raw_jobs_dependency_map()
        tasks_dependency_map = self.split_map_over_tasks(raw_dependency_map)
        tasks_dependency_map_chunked = self.split_tasks_into_chunks(tasks_dependency_map)
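        # Three passes build the task-level view of the workflow: the raw map
        # links each job (keyed by its PanDA pseudo input) to its parent jobs,
        # split_map_over_tasks() groups those jobs by task, and
        # split_tasks_into_chunks() splits any task exceeding maxJobsPerTask.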

        for task_name, jobs in tasks_dependency_map_chunked.items():
            task = RubinTask()
            task.step = task_name
            task.name = task.step
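            # The task-level submission settings below (queue, cloud, site,
            # cores, memory, ...) are copied from one representative job with
            # this label; jobs sharing a label are assumed to share them.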

            picked_job_name = next(
                filter(
                    lambda job_name: self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name],
                    self.bps_workflow,
                )
            )
            bps_node = self.bps_workflow.get_job(picked_job_name)
            task.queue = bps_node.queue
            task.cloud = bps_node.compute_cloud
            task.site = bps_node.compute_site
            task.core_count = bps_node.request_cpus
            task.priority = bps_node.priority
            task.working_group = bps_node.accounting_group
            task.jobs_pseudo_inputs = list(jobs)
            if bps_node.number_of_retries:
                task.max_attempt = bps_node.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task_name, 3)
            if bps_node.request_walltime:
                task.max_walltime = bps_node.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = bps_node.request_memory
            task.executable = self.tasks_cmd_lines[task_name]
            task.files_used_by_task = self.fill_input_files(task_name)
            task.is_final = False
            task.is_dag_end = self.tasks_steps[task_name] in self.dag_end_tasks
            tasks.append(task)
        self.add_dependencies(tasks, tasks_dependency_map_chunked)
        final_task = self.get_final_task()
        tasks.append(final_task)
        return tasks

    def split_tasks_into_chunks(self, tasks_dependency_map):
        """If a task is going to contain more jobs than the allowed
        threshold, split such a large task into chunks.

        Parameters
        ----------
        tasks_dependency_map : `dict`
            Dependencies dictionary with task names as keys and job
            dependencies as values. The latter dict has a job's input
            parameters (PanDA pseudo file name) as the key and a dict of
            (upstream task name) - (its PanDA pseudo file names) pairs
            which define the dependencies of a job.

        Returns
        -------
        tasks_dependency_map : `dict`
            Dependencies dictionary with tasks chunked where needed.
        """
        tasks_dependency_map_chunked = {}
        tasks_chunked = {}

        # At this step only task names are updated to distribute jobs
        # over chunks.
        for task_name, dependencies in tasks_dependency_map.items():
            n_jobs_in_task = len(dependencies)
            if n_jobs_in_task > self.max_jobs_per_task:
                n_chunks = -(-n_jobs_in_task // self.max_jobs_per_task)
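                # Ceiling division: the smallest number of chunks that keeps
                # each chunk at or below max_jobs_per_task. Jobs are assigned
                # to chunks by hashing their pseudo input, so a given pseudo
                # input always lands in the same chunk; the second pass below
                # relies on this when rewriting upstream dependencies.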

                for pseudo_input, dependency in dependencies.items():
                    chunk_id = hash(pseudo_input) % n_chunks
                    task_name_chunked = self.get_task_name_with_chunk(task_name, chunk_id)
                    tasks_dependency_map_chunked.setdefault(task_name_chunked, {})[pseudo_input] = dependency
                    self.tasks_steps[task_name_chunked] = self.tasks_steps[task_name]
                    self.tasks_cmd_lines[task_name_chunked] = self.tasks_cmd_lines[task_name]
                tasks_chunked[task_name] = n_chunks
            else:
                tasks_dependency_map_chunked[task_name] = dependencies

        # This block propagates chunking over upstream dependency records.
        tasks_dependency_map_chunked_updated_dep = {}
        for task, dependencies in tasks_dependency_map_chunked.items():
            for pseudo_input, dependency in dependencies.items():
                updated_dependencies = {}
                for upstream_task_name, pseudo_inputs in dependency.items():
                    if upstream_task_name in tasks_chunked:
                        for upstream_pseudo_input in pseudo_inputs:
                            chunk_id = hash(upstream_pseudo_input) % tasks_chunked[upstream_task_name]
                            task_name_chunked = self.get_task_name_with_chunk(upstream_task_name, chunk_id)
                            chunked_task_name = task_name_chunked
                            updated_dependencies.setdefault(chunked_task_name, []).append(
                                upstream_pseudo_input
                            )
                    else:
                        updated_dependencies.setdefault(upstream_task_name, []).extend(pseudo_inputs)
                tasks_dependency_map_chunked_updated_dep.setdefault(task, {}).setdefault(
                    pseudo_input, {}
                ).update(updated_dependencies)
        return tasks_dependency_map_chunked_updated_dep

    def get_task_name_with_chunk(self, task_name, chunk_id):
        """Concatenate the task name and the chunk ID.

        Parameters
        ----------
        task_name : `str`
            The name of the task

        chunk_id : `int`
            ID of the chunk

        Returns
        -------
        task_name : `str`
            Concatenated task name
        """
        return f"{task_name}_chunk_{chunk_id}"
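
    # For example (hypothetical name): get_task_name_with_chunk("u_jdoe_w40_isr", 2)
    # returns "u_jdoe_w40_isr_chunk_2".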

    def get_final_task(self):
        """If a final job exists in the generic workflow, create the DAG
        final task.

        Returns
        -------
        task : `RubinTask`
            The final task for a workflow
        """
        final_job = self.bps_workflow.get_final()
        if final_job and isinstance(final_job, GenericWorkflowJob):
            task = RubinTask()
            bash_file = FileDescriptor()
            bash_file.submission_url = final_job.executable.src_uri
            bash_file.distribution_url = os.path.join(
                self.bps_config["fileDistributionEndPoint"], final_job.executable.name
            )
            task.executable = f"bash ./{final_job.executable.name} {final_job.arguments}"

            task.step = final_job.label
            task.name = self.define_task_name(final_job.label)
            task.queue = final_job.queue
            task.cloud = final_job.compute_cloud
            task.site = final_job.compute_site
            task.core_count = final_job.request_cpus
            task.priority = final_job.priority
            task.working_group = final_job.accounting_group
            task.jobs_pseudo_inputs = []

            # This string implements an empty pattern for dependencies
            task.dependencies = [
                {"name": "pure_pseudoinput+qgraphNodeId:+qgraphId:", "submitted": False, "dependencies": []}
            ]

            if final_job.number_of_retries:
                task.max_attempt = final_job.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task.name, 3)
            if final_job.request_walltime:
                task.max_walltime = final_job.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = final_job.request_memory
            task.files_used_by_task = [bash_file]
            task.is_final = True
            task.is_dag_end = False
            return task
        elif final_job and isinstance(final_job, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")
        elif final_job:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final_job)})")

    def add_dependencies(self, tasks, tasks_dependency_map):
        """Add the dependency list to a task definition. This list defines
        all inputs of a task and how those inputs depend on upstream
        processing steps.

        Parameters
        ----------
        tasks : `list` [`RubinTask`]
            Tasks to be filled with dependency information

        tasks_dependency_map : `dict`
            Dependencies dictionary with tasks and their job dependencies.

        Notes
        -----
        This method modifies the items of the ``tasks`` list provided as an
        argument.
        """
        for task in tasks:
            jobs = tasks_dependency_map[task.step]
            task.dependencies = []
            for job, job_dependency in jobs.items():
                job_dep = {
                    "name": job,
                    "submitted": False,
                }
                input_files_dependencies = []
                for taskname, files in job_dependency.items():
                    for file in files:
                        input_files_dependencies.append(
                            {"task": taskname, "inputname": file, "available": False}
                        )
                job_dep["dependencies"] = input_files_dependencies
                task.dependencies.append(job_dep)
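
    # Each entry appended to task.dependencies has the shape (hypothetical
    # names):
    #   {"name": "<job pseudo input>",
    #    "submitted": False,
    #    "dependencies": [{"task": "u_jdoe_w40_isr",
    #                      "inputname": "<parent pseudo input>",
    #                      "available": False}]}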

    def create_raw_jobs_dependency_map(self):
        """Compute the DAG node dependency map (node - list of nodes) for
        each node in the workflow DAG.

        Returns
        -------
        dependency_map : `dict`
            For each node in the workflow DAG, its dependencies (other
            nodes).
        """

        dependency_map = {}
        cmd_line_embedder = CommandLineEmbedder(self.bps_config)

        for job_name in self.bps_workflow:
            gwjob = self.bps_workflow.get_job(job_name)
            cmd_line, pseudo_file_name = cmd_line_embedder.substitute_command_line(
                gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, job_name
            )
            if len(pseudo_file_name) > 4000:
                _LOG.error(f"pseudo_file_name: {pseudo_file_name}")
                raise NameError(
                    "job pseudo input file name contains more than 4000 symbols. Can not proceed."
                )

            task_name_for_label = self.define_task_name(gwjob.label)
            self.tasks_cmd_lines[task_name_for_label] = cmd_line
            self.jobs_steps[pseudo_file_name] = gwjob.label
            if gwjob.number_of_retries:
                self.number_of_retries[task_name_for_label] = gwjob.number_of_retries
            dependency_map[pseudo_file_name] = []
            predecessors = self.bps_workflow.predecessors(job_name)
            for parent_name in predecessors:
                parent_job = self.bps_workflow.get_job(parent_name)
                cmd_line_parent, pseudo_file_parent = cmd_line_embedder.substitute_command_line(
                    parent_job.executable.src_uri + " " + parent_job.arguments,
                    parent_job.cmdvals,
                    parent_name,
                )
                dependency_map.get(pseudo_file_name).append(pseudo_file_parent)

            successors = self.bps_workflow.successors(job_name)
            if next(successors, None) is None:
                self.dag_end_tasks.add(gwjob.label)
        return dependency_map
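
    # Schematically, create_raw_jobs_dependency_map() returns something like
    # (hypothetical pseudo input names)
    #   {"<pseudo input of job A>": [],
    #    "<pseudo input of job B>": ["<pseudo input of job A>"]}
    # i.e. each job keyed by its PanDA pseudo input and mapped to the pseudo
    # inputs of its direct parents.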

    def split_map_over_tasks(self, raw_dependency_map):
        """Group nodes performing the same operations into tasks. For each
        task, define its inputs and their dependencies.

        This is the structure to be filled out for each task name::

            dependencies = [
                {
                    "name": "filename0",
                    "dependencies": [
                        {
                            "task": "task1",
                            "inputname": "filename0",
                            "available": False
                        },
                    ],
                    "submitted": False
                }
            ]

        Parameters
        ----------
        raw_dependency_map : `dict`
            Pairs of node - list of directly connected upstream nodes.

        Returns
        -------
        tasks_dependency_map : `dict` [`str`, `list`]
            Dict of tasks and their corresponding dependencies.
        """
        tasks_dependency_map = {}
        for job, dependency in raw_dependency_map.items():
            task_name = self.define_task_name(self.jobs_steps[job])
            tasks_dependency_map.setdefault(task_name, {})[job] = self.split_dependencies_by_tasks(dependency)
            self.tasks_steps[task_name] = self.jobs_steps[job]
        return tasks_dependency_map

    def get_task_by_job_name(self, job_name):
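        """Extract the task name from a job name.

        Returns the second "_"-separated token of ``job_name``, or
        ``job_name`` itself if it contains no underscore.

        Parameters
        ----------
        job_name : `str`
            The name of the node in the workflow DAG.

        Returns
        -------
        task : `str`
            The extracted token.
        """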

        return job_name.split("_")[1] if len(job_name.split("_")) > 1 else job_name

    def split_dependencies_by_tasks(self, dependencies):
        """Group the list of dependencies by the tasks the dependencies
        come from.

        Parameters
        ----------
        dependencies : `list` [`str`]
            Dependencies of a job, given as the PanDA pseudo file names of
            its upstream jobs.

        Returns
        -------
        dependencies_by_tasks : `dict` [`str`, `list`]
            Dict of tasks and the dependency files coming from each task.
        """

474 dependencies_by_tasks = {} 

475 for dependency in dependencies: 

476 dependencies_by_tasks.setdefault(self.define_task_name(self.jobs_steps[dependency]), []).append( 

477 dependency 

478 ) 

479 return dependencies_by_tasks 

480 

481 def get_input_file(self, job_name): 

482 """Extract the quantum graph file needed for a job. 

483 

484 Parameters 

485 ---------- 

486 job_name: `str` 

487 The name of the node in workflow DAG. 

488 

489 Returns 

490 ------- 

491 quantum graph file name 

492 """ 

493 return next(iter(self.bps_workflow.nodes.get(job_name).get("inputs")))