Coverage for python/lsst/ctrl/bps/panda/idds_tasks.py: 25%

202 statements  

coverage.py v6.4.1, created at 2022-06-18 02:48 -0700

# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os.path
from dataclasses import dataclass

from lsst.ctrl.bps import GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder


@dataclass
class FileDescriptor:
    """Holds parameters needed to define a file used by a job or task"""

    name: str = None
    """Name of the file"""
    distribution_url: str = None
    """The location (URL) where this file is to be distributed to the edge node"""
    submission_url: str = None
    """Path to the file on the submission node"""
    direct_IO: bool = False
    """Whether the file is to be used remotely (direct I/O)"""
    delivered: bool = False
    """Whether this file has been delivered to the distribution endpoint"""
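
# A FileDescriptor is typically filled in by IDDSWorkflowGenerator.fill_input_files;
# a minimal illustrative sketch (all values below are hypothetical):
#
#     file = FileDescriptor()
#     file.name = "butler.yaml"
#     file.submission_url = "/path/on/submit/node/butler.yaml"
#     file.distribution_url = "https://distribution.endpoint.example/butler.yaml"
#     file.direct_IO = False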


@dataclass
class RubinTask:
    """Holds parameters needed to define a PanDA task"""

    name: str = None
    """Name of the task"""
    step: str = None
    """Processing step"""
    queue: str = None
    """Computing queue where the task is to be submitted"""
    executable: str = None
    """The task command line to be executed"""
    max_walltime: int = None
    """Maximum allowed walltime in seconds"""
    max_attempt: int = None
    """Maximum number of job attempts in a task"""
    max_rss: int = None
    """Maximum amount of RAM to be used by a job"""
    cloud: str = None
    """Computing cloud in the CRIC registry to which the task should
    be submitted"""
    jobs_pseudo_inputs: list = None
    """Names of the pseudo inputs used by the task to define its jobs"""
    files_used_by_task: list = None
    """List of physical files necessary for running a task"""
    dependencies: list = None
    """List of upstream tasks and their pseudo input parameters
    needed for running jobs in this task"""
    is_final: bool = False
    """Is this a finalization task"""
    is_dag_end: bool = False
    """Is this task at the end of the DAG"""


class IDDSWorkflowGenerator:
    """
    Class that generates an iDDS workflow to be submitted to PanDA.
    The workflow includes the definition of each task and the
    definition of the dependencies for each task input.

    Parameters
    ----------
    bps_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow constructed by the BPS system
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes the necessary submit/runtime
        information, sufficiently defined in the YAML file supplied to
        the `submit` command
    """

    def __init__(self, bps_workflow, config):
        self.bps_workflow = bps_workflow
        self.bps_config = config
        self.jobs_steps = {}
        self.tasks_steps = {}
        self.tasks_cmd_lines = {}
        self.dag_end_tasks = set()
        self.number_of_retries = {}
        _, self.max_walltime = config.search("maxWalltime", opt={"default": 90000})
        _, self.max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": 30000})

    def define_task_name(self, step):
        """Return the task name as a combination of the workflow name (unique
        across workflows) and the processing step name.

        Parameters
        ----------
        step : `str`
            Processing step name

        Returns
        -------
        task_name : `str`
            Computed task name
        """
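        # Illustrative example (the workflow name is hypothetical): with
        # workflowName set to "u_usr_pandatest_20220618T0248Z" and step
        # "calibrate", the computed task name is
        # "u_usr_pandatest_20220618T0248Z_calibrate".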

        return self.bps_config["workflowName"] + "_" + step

    def fill_input_files(self, task_name):
        """Collect the input files that must be transferred for the jobs
        belonging to the given task.
        """
        files = []
        jobs = [
            job_name
            for job_name in self.bps_workflow
            if self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name]
        ]
        for job in jobs:
            for gwfile in self.bps_workflow.get_job_inputs(job, transfer_only=True):
                file = FileDescriptor()
                file.name = gwfile.name
                file.submission_url = gwfile.src_uri
                file.distribution_url = os.path.join(
                    self.bps_config["fileDistributionEndPoint"], os.path.basename(gwfile.src_uri)
                )
                file.direct_IO = gwfile.job_access_remote
                files.append(file)
        return files

    def define_tasks(self):
        """Provide task definitions sufficient for PanDA submission

        Returns
        -------
        tasks : `list` [`RubinTask`]
            Tasks filled with parameters provided in the workflow
            configuration and the generated pipeline.
        """
        tasks = []
        raw_dependency_map = self.create_raw_jobs_dependency_map()
        tasks_dependency_map = self.split_map_over_tasks(raw_dependency_map)
        tasks_dependency_map_chunked = self.split_tasks_into_chunks(tasks_dependency_map)
        for task_name, jobs in tasks_dependency_map_chunked.items():
            task = RubinTask()
            task.step = task_name
            task.name = task.step
            picked_job_name = next(
                filter(
                    lambda job_name: self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name],
                    self.bps_workflow,
                )
            )
            bps_node = self.bps_workflow.get_job(picked_job_name)
            task.queue = bps_node.queue
            task.cloud = bps_node.compute_site
            task.jobs_pseudo_inputs = list(jobs)
            task.max_attempt = self.number_of_retries.get(task_name, 3)
            task.max_walltime = self.max_walltime
            task.max_rss = bps_node.request_memory
            task.executable = self.tasks_cmd_lines[task_name]
            task.files_used_by_task = self.fill_input_files(task_name)
            task.is_final = False
            task.is_dag_end = self.tasks_steps[task_name] in self.dag_end_tasks
            tasks.append(task)
        self.add_dependencies(tasks, tasks_dependency_map_chunked)
        final_task = self.get_final_task()
        tasks.append(final_task)
        return tasks

    def split_tasks_into_chunks(self, tasks_dependency_map):
        """If a task is going to contain more jobs than the threshold allows,
        split such a large task into chunks.

        Parameters
        ----------
        tasks_dependency_map : `dict`
            Dependencies dictionary with the task name as the key and the job
            dependencies as the value. The latter dict has a job's input
            parameters (PanDA pseudo file name) as the key and a dict of
            (upstream task name) - (its PanDA pseudo file names) pairs,
            which define the dependencies for a job, as the value.

        Returns
        -------
        tasks_dependency_map : `dict`
            Dependencies dictionary with chunked tasks where needed.
        """
        tasks_dependency_map_chunked = {}
        tasks_chunked = {}

        """At this step only task names are updated to distribute
        jobs over chunks.
        """
        for task_name, dependencies in tasks_dependency_map.items():
            n_jobs_in_task = len(dependencies)
            if n_jobs_in_task > self.max_jobs_per_task:
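                # Worked example (illustrative numbers): with 70000 jobs in
                # the task and maxJobsPerTask=30000, the ceiling division
                # -(-70000 // 30000) below yields 3 chunks, and each job is
                # assigned to chunk hash(pseudo_input) % 3.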

                n_chunks = -(-n_jobs_in_task // self.max_jobs_per_task)
                for pseudo_input, dependency in dependencies.items():
                    chunk_id = hash(pseudo_input) % n_chunks
                    task_name_chunked = self.get_task_name_with_chunk(task_name, chunk_id)
                    tasks_dependency_map_chunked.setdefault(task_name_chunked, {})[pseudo_input] = dependency
                    self.tasks_steps[task_name_chunked] = self.tasks_steps[task_name]
                    self.tasks_cmd_lines[task_name_chunked] = self.tasks_cmd_lines[task_name]
                tasks_chunked[task_name] = n_chunks
            else:
                tasks_dependency_map_chunked[task_name] = dependencies

        """This block propagates the chunking over the upstream dependency
        records.
        """
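        # For example (the task name is illustrative): if upstream task
        # "wf_isr" was split into 2 chunks, a job's dependency on it is
        # redirected to the chunked name selected by
        # hash(upstream_pseudo_input) % 2, e.g. "wf_isr_chunk_1".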

        tasks_dependency_map_chunked_updated_dep = {}
        for task, dependencies in tasks_dependency_map_chunked.items():
            for pseudo_input, dependency in dependencies.items():
                updated_dependencies = {}
                for upstream_task_name, pseudo_inputs in dependency.items():
                    if upstream_task_name in tasks_chunked:
                        for upstream_pseudo_input in pseudo_inputs:
                            chunk_id = hash(upstream_pseudo_input) % tasks_chunked[upstream_task_name]
                            task_name_chunked = self.get_task_name_with_chunk(upstream_task_name, chunk_id)
                            updated_dependencies.setdefault(task_name_chunked, []).append(
                                upstream_pseudo_input
                            )
                    else:
                        updated_dependencies.setdefault(upstream_task_name, []).extend(pseudo_inputs)
                tasks_dependency_map_chunked_updated_dep.setdefault(task, {}).setdefault(
                    pseudo_input, {}
                ).update(updated_dependencies)
        return tasks_dependency_map_chunked_updated_dep

    def get_task_name_with_chunk(self, task_name, chunk_id):
        """Concatenate the task name and the chunk ID.

        Parameters
        ----------
        task_name : `str`
            The name of the task

        chunk_id : `int`
            ID of the chunk

        Returns
        -------
        task_name : `str`
            Concatenated task name
        """
        return f"{task_name}_chunk_{chunk_id}"
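        # For example, get_task_name_with_chunk("wf_calibrate", 2) returns
        # "wf_calibrate_chunk_2" (the task name "wf_calibrate" is illustrative).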

    def get_final_task(self):
        """If a final job exists in the generic workflow, create the DAG final
        task.

        Returns
        -------
        task : `RubinTask`
            The final task for a workflow
        """
        final_job = self.bps_workflow.get_final()
        if final_job and isinstance(final_job, GenericWorkflowJob):
            task = RubinTask()
            bash_file = FileDescriptor()
            bash_file.submission_url = final_job.executable.src_uri
            bash_file.distribution_url = os.path.join(
                self.bps_config["fileDistributionEndPoint"], final_job.executable.name
            )
            task.executable = f"bash ./{final_job.executable.name} {final_job.arguments}"

            task.step = final_job.label
            task.name = self.define_task_name(final_job.label)
            task.queue = final_job.queue
            task.cloud = final_job.compute_site
            task.jobs_pseudo_inputs = []

            # This string implements an empty pattern for dependencies
            task.dependencies = [
                {"name": "pure_pseudoinput+qgraphNodeId:+qgraphId:", "submitted": False, "dependencies": []}
            ]

            task.max_attempt = self.number_of_retries.get(task.name, 3)
            task.max_walltime = self.max_walltime
            task.max_rss = final_job.request_memory
            task.files_used_by_task = [bash_file]
            task.is_final = True
            task.is_dag_end = False
            return task
        elif final_job and isinstance(final_job, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")
        elif final_job:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final_job)})")

    def add_dependencies(self, tasks, tasks_dependency_map):
        """Add the dependency list to a task definition. This list defines all
        inputs of a task and how those inputs depend on upstream processing
        steps.

        Parameters
        ----------
        tasks : `list` [`RubinTask`]
            Tasks to be filled with dependency information

        tasks_dependency_map : `dict`
            Dependencies dictionary

        Returns
        -------
        None; the method modifies items in the tasks list provided as an
        argument.
        """
        for task in tasks:
            jobs = tasks_dependency_map[task.step]
            task.dependencies = []
            for job, job_dependency in jobs.items():
                job_dep = {
                    "name": job,
                    "submitted": False,
                }
                input_files_dependencies = []
                for taskname, files in job_dependency.items():
                    for file in files:
                        input_files_dependencies.append(
                            {"task": taskname, "inputname": file, "available": False}
                        )
                job_dep["dependencies"] = input_files_dependencies
                task.dependencies.append(job_dep)

    def create_raw_jobs_dependency_map(self):
        """Compute the DAG node dependency map (node - list of nodes) for each
        node in the workflow DAG.

        Returns
        -------
        dependency_map : `dict` of `node-dependencies` pairs
            For each node in the workflow DAG its dependencies (other nodes)
            are computed.
        """

        dependency_map = {}
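        # The map is keyed by a job's PanDA pseudo input file name and holds
        # the list of the pseudo input file names of its direct predecessors.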

        cmd_line_embedder = CommandLineEmbedder(self.bps_config)

        for job_name in self.bps_workflow:
            gwjob = self.bps_workflow.get_job(job_name)
            cmd_line, pseudo_file_name = cmd_line_embedder.substitute_command_line(
                gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, job_name
            )
            if len(pseudo_file_name) > 4000:
                raise NameError(
                    "job pseudo input file name contains more than 4000 characters. Cannot proceed."
                )

            self.tasks_cmd_lines[self.define_task_name(gwjob.label)] = cmd_line
            self.jobs_steps[pseudo_file_name] = gwjob.label
            self.number_of_retries[self.define_task_name(gwjob.label)] = gwjob.number_of_retries
            dependency_map[pseudo_file_name] = []
            predecessors = self.bps_workflow.predecessors(job_name)
            for parent_name in predecessors:
                parent_job = self.bps_workflow.get_job(parent_name)
                cmd_line_parent, pseudo_file_parent = cmd_line_embedder.substitute_command_line(
                    parent_job.executable.src_uri + " " + parent_job.arguments,
                    parent_job.cmdvals,
                    parent_name,
                )
                dependency_map.get(pseudo_file_name).append(pseudo_file_parent)

            successors = self.bps_workflow.successors(job_name)
            if next(successors, None) is None:
                self.dag_end_tasks.add(gwjob.label)
        return dependency_map

    def split_map_over_tasks(self, raw_dependency_map):
        """Group nodes performing the same operation into tasks. For each task
        define its inputs and their dependencies.

        This is the structure to be filled out in this function::

            dependencies = [
                {
                    "name": "filename0",
                    "dependencies": [
                        {
                            "task": "task1",
                            "inputname": "filename0",
                            "available": False
                        },
                    ],
                    "submitted": False
                }
            ]

        Parameters
        ----------
        raw_dependency_map : `dict`
            Pairs of node - list of directly connected upstream nodes

        Returns
        -------
        tasks_dependency_map : `dict` [`str`, `list`]
            Dict of tasks and their corresponding dependencies
        """
        tasks_dependency_map = {}
        for job, dependency in raw_dependency_map.items():
            task_name = self.define_task_name(self.jobs_steps[job])
            tasks_dependency_map.setdefault(task_name, {})[job] = self.split_dependencies_by_tasks(dependency)
            self.tasks_steps[task_name] = self.jobs_steps[job]
        return tasks_dependency_map

    def get_task_by_job_name(self, job_name):
        """Return the second underscore-delimited part of the job name, or
        the job name itself if it contains no underscore.
        """
        return job_name.split("_")[1] if len(job_name.split("_")) > 1 else job_name

    def split_dependencies_by_tasks(self, dependencies):
        """Group the list of dependencies by the tasks the dependencies come
        from.

        Parameters
        ----------
        dependencies : `list` [`str`]
            Pseudo input file names of the upstream jobs a job depends on

        Returns
        -------
        dependencies_by_tasks : `dict` [`str`, `list`]
            Dict mapping each upstream task to the dependency files coming
            from that task.
        """
        dependencies_by_tasks = {}
        for dependency in dependencies:
            dependencies_by_tasks.setdefault(self.define_task_name(self.jobs_steps[dependency]), []).append(
                dependency
            )
        return dependencies_by_tasks

    def get_input_file(self, job_name):
        """Extract the quantum graph file needed for a job.

        Parameters
        ----------
        job_name : `str`
            The name of the node in the workflow DAG.

        Returns
        -------
        Quantum graph file name
        """
        return next(iter(self.bps_workflow.nodes.get(job_name).get("inputs")))
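

# Minimal usage sketch (assumes a BpsConfig and a GenericWorkflow have already
# been produced by BPS; the variable names below are illustrative only):
#
#     generator = IDDSWorkflowGenerator(bps_workflow, bps_config)
#     tasks = generator.define_tasks()
#     for task in tasks:
#         print(task.name, task.step, task.max_attempt)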