Coverage for python/lsst/ctrl/bps/panda/idds_tasks.py: 26%

236 statements  

coverage.py v6.4.2, created at 2022-07-13 03:02 -0700

# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os.path
from dataclasses import dataclass

from lsst.ctrl.bps import GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder


@dataclass
class FileDescriptor:
    """Holds parameters needed to define a file used by a job of a task."""

    name: str = None
    """Name of the file"""
    distribution_url: str = None
    """The location (URL) where this file is to be distributed to the edge node"""
    submission_url: str = None
    """Path to the file on the submission node"""
    direct_IO: bool = False
    """Whether the file is to be accessed remotely (direct I/O)"""
    delivered: bool = False
    """Whether this file has been delivered to the distribution endpoint"""


@dataclass
class RubinTask:
    """Holds parameters needed to define a PanDA task."""

    name: str = None
    """Name of the task"""
    step: str = None
    """Processing step"""
    queue: str = None
    """Computing queue where the task is to be submitted"""
    executable: str = None
    """The task command line to be executed"""
    max_walltime: int = None
    """Maximum allowed walltime in seconds"""
    max_attempt: int = None
    """Maximum number of job attempts in a task"""
    max_rss: int = None
    """Maximum amount of RAM to be used by a job"""
    cloud: str = None
    """Computing cloud in the CRIC registry where the task should
    be submitted to"""
    site: str = None
    """Computing site in the CRIC registry where the task should
    be submitted to"""
    core_count: int = 1
    """Number of CPU cores to be used by a job"""
    working_group: str = None
    """Group for accounting"""
    priority: int = 0
    """Task priority"""
    processing_type: str = None
    """Task processing type, such as simulation or reconstruction"""
    task_type: str = None
    """The type of the task, such as production or analysis"""
    prod_source_label: str = "managed"
    """Label to distinguish production jobs from test jobs. Its value
    can be 'managed' or 'test'"""
    vo: str = "Rubin"
    """Virtual organization name"""
    jobs_pseudo_inputs: list = None
    """Names of the pseudo inputs that define the jobs of this task"""
    files_used_by_task: list = None
    """List of physical files necessary for running a task"""
    dependencies: list = None
    """List of upstream tasks and their pseudo input parameters
    needed for running jobs in this task"""
    is_final: bool = False
    """Is this a finalization task"""
    is_dag_end: bool = False
    """Is this task at the end of the DAG"""


class IDDSWorkflowGenerator:
97 """ 

98 Class generates a iDDS workflow to be submitted into PanDA. 

99 Workflow includes definition of each task and 

100 definition of dependencies for each task input. 

101 

102 Parameters 

103 ---------- 

104 bps_workflow : `lsst.ctrl.bps.GenericWorkflow` 

105 The generic workflow constructed by BPS system 

106 config : `lsst.ctrl.bps.BpsConfig` 

107 BPS configuration that includes necessary submit/runtime information, 

108 sufficiently defined in YAML file supplied in `submit` command 

109 """ 

110 

111 def __init__(self, bps_workflow, config): 

112 self.bps_workflow = bps_workflow 

113 self.bps_config = config 

114 self.jobs_steps = {} 

115 self.tasks_steps = {} 

116 self.tasks_cmd_lines = {} 

117 self.dag_end_tasks = set() 

118 self.number_of_retries = {} 

119 _, self.max_walltime = config.search("maxWalltime", opt={"default": 90000}) 

120 _, self.max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": 30000}) 

121 

122 def define_task_name(self, step): 

123 """Return task name as a combination of the workflow name (unique 

124 across workflows) and processing step name. 

125 

126 Parameters 

127 ---------- 

128 step : `str` 

129 Processing step name 

130 

131 Returns 

132 ------- 

133 Task name : `str` 

134 Computed task name 

135 """ 

        return self.bps_config["workflowName"] + "_" + step

    def fill_input_files(self, task_name):
        """Collect the input files to be distributed for the jobs of a task.

        Parameters
        ----------
        task_name : `str`
            Name of the task whose job inputs are collected.

        Returns
        -------
        files : `list` [`FileDescriptor`]
            Descriptors of the files to be delivered to the edge node.
        """
        files = []
        jobs = [
            job_name
            for job_name in self.bps_workflow
            if self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name]
        ]
        for job in jobs:
            for gwfile in self.bps_workflow.get_job_inputs(job, transfer_only=True):
                file = FileDescriptor()
                file.name = gwfile.name
                file.submission_url = gwfile.src_uri
                file.distribution_url = os.path.join(
                    self.bps_config["fileDistributionEndPoint"], os.path.basename(gwfile.src_uri)
                )
                file.direct_IO = gwfile.job_access_remote
                files.append(file)
        return files

    def define_tasks(self):
        """Provide task definitions sufficient for PanDA submission.

        Returns
        -------
        tasks : `list` [`RubinTask`]
            Tasks filled with parameters provided in the workflow
            configuration and the generated pipeline.
        """

        tasks = []
        raw_dependency_map = self.create_raw_jobs_dependency_map()
        tasks_dependency_map = self.split_map_over_tasks(raw_dependency_map)
        tasks_dependency_map_chunked = self.split_tasks_into_chunks(tasks_dependency_map)
        for task_name, jobs in tasks_dependency_map_chunked.items():
            task = RubinTask()
            task.step = task_name
            task.name = task.step
            picked_job_name = next(
                filter(
                    lambda job_name: self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name],
                    self.bps_workflow,
                )
            )
            bps_node = self.bps_workflow.get_job(picked_job_name)
            task.queue = bps_node.queue
            task.cloud = bps_node.compute_cloud
            task.site = bps_node.compute_site
            task.core_count = bps_node.request_cpus
            task.priority = bps_node.priority
            task.working_group = bps_node.accounting_group
            task.jobs_pseudo_inputs = list(jobs)
            if bps_node.number_of_retries:
                task.max_attempt = bps_node.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task_name, 5)
            if bps_node.request_walltime:
                task.max_walltime = bps_node.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = bps_node.request_memory
            task.executable = self.tasks_cmd_lines[task_name]
            task.files_used_by_task = self.fill_input_files(task_name)
            task.is_final = False
            task.is_dag_end = self.tasks_steps[task_name] in self.dag_end_tasks
            tasks.append(task)
        self.add_dependencies(tasks, tasks_dependency_map_chunked)
        final_task = self.get_final_task()
        tasks.append(final_task)
        return tasks

    def split_tasks_into_chunks(self, tasks_dependency_map):
        """Split tasks that contain more jobs than the configured threshold
        into chunks.

        Parameters
        ----------
        tasks_dependency_map : `dict`
            Dependencies dictionary with task names as keys and job
            dependencies as values. The latter dict has the job input
            parameters (PanDA pseudo file name) as keys and, as values, a
            dict of (upstream task name) - (its PanDA pseudo file names)
            pairs which defines the dependencies of a job.

        Returns
        -------
        tasks_dependency_map : `dict`
            Dependencies dictionary with chunked tasks where needed.
        """

        tasks_dependency_map_chunked = {}
        tasks_chunked = {}

        # At this step only task names are updated to distribute the jobs
        # over chunks.
        for task_name, dependencies in tasks_dependency_map.items():
            n_jobs_in_task = len(dependencies)
            if n_jobs_in_task > self.max_jobs_per_task:
                n_chunks = -(-n_jobs_in_task // self.max_jobs_per_task)
                for pseudo_input, dependency in dependencies.items():
                    chunk_id = hash(pseudo_input) % n_chunks
                    task_name_chunked = self.get_task_name_with_chunk(task_name, chunk_id)
                    tasks_dependency_map_chunked.setdefault(task_name_chunked, {})[pseudo_input] = dependency
                    self.tasks_steps[task_name_chunked] = self.tasks_steps[task_name]
                    self.tasks_cmd_lines[task_name_chunked] = self.tasks_cmd_lines[task_name]
                tasks_chunked[task_name] = n_chunks
            else:
                tasks_dependency_map_chunked[task_name] = dependencies

        # This block propagates the chunking over the upstream dependency
        # records.
        tasks_dependency_map_chunked_updated_dep = {}
        for task, dependencies in tasks_dependency_map_chunked.items():
            for pseudo_input, dependency in dependencies.items():
                updated_dependencies = {}
                for upstream_task_name, pseudo_inputs in dependency.items():
                    if upstream_task_name in tasks_chunked:
                        for upstream_pseudo_input in pseudo_inputs:
                            chunk_id = hash(upstream_pseudo_input) % tasks_chunked[upstream_task_name]
                            chunked_task_name = self.get_task_name_with_chunk(upstream_task_name, chunk_id)
                            updated_dependencies.setdefault(chunked_task_name, []).append(
                                upstream_pseudo_input
                            )
                    else:
                        updated_dependencies.setdefault(upstream_task_name, []).extend(pseudo_inputs)
                tasks_dependency_map_chunked_updated_dep.setdefault(task, {}).setdefault(
                    pseudo_input, {}
                ).update(updated_dependencies)
        return tasks_dependency_map_chunked_updated_dep

    def get_task_name_with_chunk(self, task_name, chunk_id):
        """Concatenate a task name and a chunk ID.

        Parameters
        ----------
        task_name : `str`
            The name of the task.
        chunk_id : `int`
            ID of the chunk.

        Returns
        -------
        task_name : `str`
            Concatenated task name.
        """
        return f"{task_name}_chunk_{chunk_id}"

    def get_final_task(self):
        """If a final job exists in the generic workflow, create the DAG
        final task.

        Returns
        -------
        task : `RubinTask`
            The final task for a workflow.
        """

        final_job = self.bps_workflow.get_final()
        if final_job and isinstance(final_job, GenericWorkflowJob):
            task = RubinTask()
            bash_file = FileDescriptor()
            bash_file.submission_url = final_job.executable.src_uri
            bash_file.distribution_url = os.path.join(
                self.bps_config["fileDistributionEndPoint"], final_job.executable.name
            )
            task.executable = f"bash ./{final_job.executable.name} {final_job.arguments}"

            task.step = final_job.label
            task.name = self.define_task_name(final_job.label)
            task.queue = final_job.queue
            task.cloud = final_job.compute_cloud
            task.site = final_job.compute_site
            task.core_count = final_job.request_cpus
            task.priority = final_job.priority
            task.working_group = final_job.accounting_group
            task.jobs_pseudo_inputs = []

            # This string implements the empty pattern for dependencies.
            task.dependencies = [
                {"name": "pure_pseudoinput+qgraphNodeId:+qgraphId:", "submitted": False, "dependencies": []}
            ]

            if final_job.number_of_retries:
                task.max_attempt = final_job.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task.name, 5)
            if final_job.request_walltime:
                task.max_walltime = final_job.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = final_job.request_memory
            task.files_used_by_task = [bash_file]
            task.is_final = True
            task.is_dag_end = False
            return task
        elif final_job and isinstance(final_job, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")
        elif final_job:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final_job)})")

    def add_dependencies(self, tasks, tasks_dependency_map):
        """Add the dependency list to a task definition. This list defines all
        inputs of a task and how those inputs depend on upstream processing
        steps.

        Parameters
        ----------
        tasks : `list` [`RubinTask`]
            Tasks to be filled with dependency information.
        tasks_dependency_map : `dict`
            Dependencies dictionary with task names as keys and job
            dependencies as values.

        Returns
        -------
        None
            The method modifies items of the ``tasks`` list provided as an
            argument.
        """

        for task in tasks:
            jobs = tasks_dependency_map[task.step]
            task.dependencies = []
            for job, job_dependency in jobs.items():
                job_dep = {
                    "name": job,
                    "submitted": False,
                }
                input_files_dependencies = []
                for taskname, files in job_dependency.items():
                    for file in files:
                        input_files_dependencies.append(
                            {"task": taskname, "inputname": file, "available": False}
                        )
                job_dep["dependencies"] = input_files_dependencies
                task.dependencies.append(job_dep)

    def create_raw_jobs_dependency_map(self):
        """Compute the DAG nodes dependency map (node - list of nodes) for
        each node in the workflow DAG.

        Returns
        -------
        dependency_map : `dict`
            Pairs of a node and its dependencies (other nodes) for each
            node in the workflow DAG.
        """

        dependency_map = {}
        cmd_line_embedder = CommandLineEmbedder(self.bps_config)

        for job_name in self.bps_workflow:
            gwjob = self.bps_workflow.get_job(job_name)
            cmd_line, pseudo_file_name = cmd_line_embedder.substitute_command_line(
                gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, job_name
            )
            if len(pseudo_file_name) > 4000:
                raise NameError(
                    "job pseudo input file name contains more than 4000 symbols. Can not proceed."
                )

            task_name_for_label = self.define_task_name(gwjob.label)
            self.tasks_cmd_lines[task_name_for_label] = cmd_line
            self.jobs_steps[pseudo_file_name] = gwjob.label
            if gwjob.number_of_retries:
                self.number_of_retries[task_name_for_label] = gwjob.number_of_retries
            dependency_map[pseudo_file_name] = []
            predecessors = self.bps_workflow.predecessors(job_name)
            for parent_name in predecessors:
                parent_job = self.bps_workflow.get_job(parent_name)
                cmd_line_parent, pseudo_file_parent = cmd_line_embedder.substitute_command_line(
                    parent_job.executable.src_uri + " " + parent_job.arguments,
                    parent_job.cmdvals,
                    parent_name,
                )
                dependency_map[pseudo_file_name].append(pseudo_file_parent)

            successors = self.bps_workflow.successors(job_name)
            if next(successors, None) is None:
                self.dag_end_tasks.add(gwjob.label)
        return dependency_map

    def split_map_over_tasks(self, raw_dependency_map):
416 """Group nodes performing same operations into tasks. For each task 

417 define inputs and its dependencies. 

418 

419 This is a structure to be filled out in function taskname: :: 

420 

421 dependencies = [ 

422 { 

423 "name": "filename0", 

424 "dependencies": [ 

425 { 

426 "task": "task1", 

427 "inputname":"filename0", 

428 "available": False" 

429 }, 

430 ], 

431 "submitted": False 

432 } 

433 ] 

434 

435 Parameters 

436 ---------- 

437 raw_dependency_map : `dict` 

438 Pairs node-list of directly connected upstream nodes 

439 

440 Returns 

441 ------- 

442 tasks_dependency_map : `dict` [`str`, `list`] 

443 Dict of tasks/correspondent dependencies 

444 """ 

445 tasks_dependency_map = {} 

446 for job, dependency in raw_dependency_map.items(): 

447 task_name = self.define_task_name(self.jobs_steps[job]) 

448 tasks_dependency_map.setdefault(task_name, {})[job] = self.split_dependencies_by_tasks(dependency) 

449 self.tasks_steps[task_name] = self.jobs_steps[job] 

450 return tasks_dependency_map 

451 

452 def get_task_by_job_name(self, job_name): 

453 return job_name.split("_")[1] if len(job_name.split("_")) > 1 else job_name 

454 

455 def split_dependencies_by_tasks(self, dependencies): 

456 """Group the list of dependencies by tasks where dependencies comes 

457 from. 

458 

459 Parameters 

460 ---------- 

461 dependencies : `list` [`dicts`] 

462 Each dictionary in the list contains information about 

463 dependency: task,inputname,available 

464 

465 Returns 

466 ------- 

467 dependencies_by_tasks : `dict` [`str`, `list`] 

468 Dict of tasks/dependency files comes from that task. 

469 """ 

470 dependencies_by_tasks = {} 

471 for dependency in dependencies: 

472 dependencies_by_tasks.setdefault(self.define_task_name(self.jobs_steps[dependency]), []).append( 

473 dependency 

474 ) 

475 return dependencies_by_tasks 

476 

477 def get_input_file(self, job_name): 

478 """Extract the quantum graph file needed for a job. 

479 

480 Parameters 

481 ---------- 

482 job_name: `str` 

483 The name of the node in workflow DAG. 

484 

485 Returns 

486 ------- 

487 quantum graph file name 

488 """ 

489 return next(iter(self.bps_workflow.nodes.get(job_name).get("inputs")))
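
# Illustrative usage sketch (hypothetical driver code, not part of this
# module): given a populated GenericWorkflow and a BpsConfig, the PanDA
# plugin constructs the generator and requests the task definitions roughly
# as follows:
#
#     generator = IDDSWorkflowGenerator(generic_workflow, bps_config)
#     tasks = generator.define_tasks()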