Coverage for python/lsst/ctrl/bps/panda/idds_tasks.py: 25%
202 statements
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os.path
from dataclasses import dataclass

from lsst.ctrl.bps import GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder


@dataclass
class FileDescriptor:
    """Holds parameters needed to define a file used by a job or task."""

    name: str = None
    """Name of the file"""
    distribution_url: str = None
    """The location (URL) where this file is to be distributed to the edge node"""
    submission_url: str = None
    """Path to the file on the submission node"""
    direct_IO: bool = False
    """Whether the file is to be used remotely (direct I/O)"""
    delivered: bool = False
    """Whether this file has been delivered to the distribution endpoint"""


@dataclass
class RubinTask:
    """Holds parameters needed to define a PanDA task."""

    name: str = None
    """Name of the task"""
    step: str = None
    """Processing step"""
    queue: str = None
    """Computing queue where the task is to be submitted"""
    executable: str = None
    """The task command line to be executed"""
    max_walltime: int = None
    """Maximum allowed walltime in seconds"""
    max_attempt: int = None
    """Maximum number of job attempts in a task"""
    max_rss: int = None
    """Maximum size of RAM to be used by a job"""
    cloud: str = None
    """Computing cloud in the CRIC registry where the task should
    be submitted to"""
    jobs_pseudo_inputs: list = None
    """Names of the pseudo inputs used by the task to define its jobs"""
    files_used_by_task: list = None
    """List of physical files necessary for running a task"""
    dependencies: list = None
    """List of upstream tasks and their pseudo input parameters
    needed for running jobs in this task"""
    is_final: bool = False
    """Is this a finalization task"""
    is_dag_end: bool = False
    """Is this task at the end of the DAG"""


class IDDSWorkflowGenerator:
    """Class that generates an iDDS workflow to be submitted to PanDA.

    The workflow includes the definition of each task and the definition
    of the dependencies for each task input.

    Parameters
    ----------
    bps_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow constructed by the BPS system.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information, sufficiently defined in the YAML file supplied
        to the `submit` command.
    """

    def __init__(self, bps_workflow, config):
        self.bps_workflow = bps_workflow
        self.bps_config = config
        self.jobs_steps = {}
        self.tasks_steps = {}
        self.tasks_cmd_lines = {}
        self.dag_end_tasks = set()
        self.number_of_retries = {}
        _, self.max_walltime = config.search("maxWalltime", opt={"default": 90000})
        _, self.max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": 30000})

    def define_task_name(self, step):
        """Return task name as a combination of the workflow name (unique
        across workflows) and processing step name.

        Parameters
        ----------
        step : `str`
            Processing step name.

        Returns
        -------
        task_name : `str`
            Computed task name.
        """
        return self.bps_config["workflowName"] + "_" + step

    def fill_input_files(self, task_name):
        """Collect the input files to be distributed for jobs of a task.

        Parameters
        ----------
        task_name : `str`
            Name of the task.

        Returns
        -------
        files : `list` [`FileDescriptor`]
            Descriptors of the files to be delivered to the edge nodes.
        """
        files = []
        jobs = [
            job_name
            for job_name in self.bps_workflow
            if self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name]
        ]
        for job in jobs:
            for gwfile in self.bps_workflow.get_job_inputs(job, transfer_only=True):
                file = FileDescriptor()
                file.name = gwfile.name
                file.submission_url = gwfile.src_uri
                file.distribution_url = os.path.join(
                    self.bps_config["fileDistributionEndPoint"], os.path.basename(gwfile.src_uri)
                )
                file.direct_IO = gwfile.job_access_remote
                files.append(file)
        return files

    def define_tasks(self):
        """Provide task definitions sufficient for PanDA submission.

        Returns
        -------
        tasks : `list` [`RubinTask`]
            Tasks filled with parameters provided in the workflow
            configuration and the generated pipeline.
        """
        tasks = []
        raw_dependency_map = self.create_raw_jobs_dependency_map()
        tasks_dependency_map = self.split_map_over_tasks(raw_dependency_map)
        tasks_dependency_map_chunked = self.split_tasks_into_chunks(tasks_dependency_map)
        for task_name, jobs in tasks_dependency_map_chunked.items():
            task = RubinTask()
            task.step = task_name
            task.name = task.step
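            # Job settings that apply to the whole task (queue, cloud, memory
            # request) are read from an arbitrary job carrying this task's
            # label; this assumes those settings are shared by all jobs of
            # the task.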
            picked_job_name = next(
                filter(
                    lambda job_name: self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name],
                    self.bps_workflow,
                )
            )
            bps_node = self.bps_workflow.get_job(picked_job_name)
            task.queue = bps_node.queue
            task.cloud = bps_node.compute_site
            task.jobs_pseudo_inputs = list(jobs)
            task.max_attempt = self.number_of_retries.get(task_name, 3)
            task.max_walltime = self.max_walltime
            task.max_rss = bps_node.request_memory
            task.executable = self.tasks_cmd_lines[task_name]
            task.files_used_by_task = self.fill_input_files(task_name)
            task.is_final = False
            task.is_dag_end = self.tasks_steps[task_name] in self.dag_end_tasks
            tasks.append(task)
        self.add_dependencies(tasks, tasks_dependency_map_chunked)
        final_task = self.get_final_task()
        tasks.append(final_task)
        return tasks

    def split_tasks_into_chunks(self, tasks_dependency_map):
        """If a task is going to contain a number of jobs above a threshold,
        split such a large task into chunks.

        Parameters
        ----------
        tasks_dependency_map : `dict`
            Dependency dictionary with a task name as the key and job
            dependencies as the value. The latter dict has a job's input
            parameters (PanDA pseudo file name) as the key and a dict of
            (upstream task name) - (its PanDA pseudo file names) pairs,
            which defines the dependencies for a job.

        Returns
        -------
        tasks_dependency_map : `dict`
            Dependency dictionary with tasks chunked where needed.
        """
        tasks_dependency_map_chunked = {}
        tasks_chunked = {}

        # At this step only task names are updated to distribute
        # jobs over chunks.
        for task_name, dependencies in tasks_dependency_map.items():
            n_jobs_in_task = len(dependencies)
            if n_jobs_in_task > self.max_jobs_per_task:
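                # -(-a // b) is ceiling division: the smallest number of
                # chunks keeping roughly max_jobs_per_task jobs per chunk.
                # Jobs are then distributed over chunks by hashing their
                # pseudo input name.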
                n_chunks = -(-n_jobs_in_task // self.max_jobs_per_task)
                for pseudo_input, dependency in dependencies.items():
                    chunk_id = hash(pseudo_input) % n_chunks
                    task_name_chunked = self.get_task_name_with_chunk(task_name, chunk_id)
                    tasks_dependency_map_chunked.setdefault(task_name_chunked, {})[pseudo_input] = dependency
                    self.tasks_steps[task_name_chunked] = self.tasks_steps[task_name]
                    self.tasks_cmd_lines[task_name_chunked] = self.tasks_cmd_lines[task_name]
                tasks_chunked[task_name] = n_chunks
            else:
                tasks_dependency_map_chunked[task_name] = dependencies

        # This block propagates chunking over upstream dependency records.
        tasks_dependency_map_chunked_updated_dep = {}
        for task, dependencies in tasks_dependency_map_chunked.items():
            for pseudo_input, dependency in dependencies.items():
                updated_dependencies = {}
                for upstream_task_name, pseudo_inputs in dependency.items():
                    if upstream_task_name in tasks_chunked:
                        for upstream_pseudo_input in pseudo_inputs:
                            chunk_id = hash(upstream_pseudo_input) % tasks_chunked[upstream_task_name]
                            task_name_chunked = self.get_task_name_with_chunk(upstream_task_name, chunk_id)
                            chunked_task_name = task_name_chunked
                            updated_dependencies.setdefault(chunked_task_name, []).append(
                                upstream_pseudo_input
                            )
                    else:
                        updated_dependencies.setdefault(upstream_task_name, []).extend(pseudo_inputs)
                tasks_dependency_map_chunked_updated_dep.setdefault(task, {}).setdefault(
                    pseudo_input, {}
                ).update(updated_dependencies)
        return tasks_dependency_map_chunked_updated_dep

    def get_task_name_with_chunk(self, task_name, chunk_id):
        """Concatenate a task name and a chunk ID.

        Parameters
        ----------
        task_name : `str`
            The name of the task.
        chunk_id : `int`
            ID of the chunk.

        Returns
        -------
        task_name : `str`
            Concatenated task name.
        """
        return f"{task_name}_chunk_{chunk_id}"

    def get_final_task(self):
        """If a final job exists in the generic workflow, create the DAG
        final task.

        Returns
        -------
        task : `RubinTask`
            The final task for a workflow.
        """
        final_job = self.bps_workflow.get_final()
        if final_job and isinstance(final_job, GenericWorkflowJob):
            task = RubinTask()
            bash_file = FileDescriptor()
            bash_file.submission_url = final_job.executable.src_uri
            bash_file.distribution_url = os.path.join(
                self.bps_config["fileDistributionEndPoint"], final_job.executable.name
            )
            task.executable = f"bash ./{final_job.executable.name} {final_job.arguments}"

            task.step = final_job.label
            task.name = self.define_task_name(final_job.label)
            task.queue = final_job.queue
            task.cloud = final_job.compute_site
            task.jobs_pseudo_inputs = []

            # This string implements an empty pattern for dependencies.
            task.dependencies = [
                {"name": "pure_pseudoinput+qgraphNodeId:+qgraphId:", "submitted": False, "dependencies": []}
            ]

            task.max_attempt = self.number_of_retries.get(task.name, 3)
            task.max_walltime = self.max_walltime
            task.max_rss = final_job.request_memory
            task.files_used_by_task = [bash_file]
            task.is_final = True
            task.is_dag_end = False
            return task
        elif final_job and isinstance(final_job, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")
        elif final_job:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final_job)})")

    def add_dependencies(self, tasks, tasks_dependency_map):
        """Add the dependency list to a task definition. This list defines
        all inputs of a task and how those inputs depend on upstream
        processing steps.

        Parameters
        ----------
        tasks : `list` [`RubinTask`]
            Tasks to be filled with dependency information.
        tasks_dependency_map : `dict`
            Dictionary of dependencies.

        Returns
        -------
        None; the method modifies items in the tasks list provided as an
        argument.
        """
        for task in tasks:
            jobs = tasks_dependency_map[task.step]
            task.dependencies = []
            for job, job_dependency in jobs.items():
                job_dep = {
                    "name": job,
                    "submitted": False,
                }
                input_files_dependencies = []
                for taskname, files in job_dependency.items():
                    for file in files:
                        input_files_dependencies.append(
                            {"task": taskname, "inputname": file, "available": False}
                        )
                job_dep["dependencies"] = input_files_dependencies
                task.dependencies.append(job_dep)

    def create_raw_jobs_dependency_map(self):
        """Compute the DAG node dependency map (node - list of nodes) for
        each node in the workflow DAG.

        Returns
        -------
        dependency_map : `dict` of node-dependencies pairs
            For each node in the workflow DAG, its dependencies (other
            nodes) are computed.
        """
        dependency_map = {}
        cmd_line_embedder = CommandLineEmbedder(self.bps_config)

        for job_name in self.bps_workflow:
            gwjob = self.bps_workflow.get_job(job_name)
            cmd_line, pseudo_file_name = cmd_line_embedder.substitute_command_line(
                gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, job_name
            )
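            # The pseudo input file name is passed to PanDA/iDDS as an input
            # name; very long names are rejected here (the 4000-character
            # cap is presumably dictated by the downstream systems).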
            if len(pseudo_file_name) > 4000:
                raise NameError(
                    "job pseudo input file name contains more than 4000 symbols. Can not proceed."
                )

            self.tasks_cmd_lines[self.define_task_name(gwjob.label)] = cmd_line
            self.jobs_steps[pseudo_file_name] = gwjob.label
            self.number_of_retries[self.define_task_name(gwjob.label)] = gwjob.number_of_retries
            dependency_map[pseudo_file_name] = []
            predecessors = self.bps_workflow.predecessors(job_name)
            for parent_name in predecessors:
                parent_job = self.bps_workflow.get_job(parent_name)
                cmd_line_parent, pseudo_file_parent = cmd_line_embedder.substitute_command_line(
                    parent_job.executable.src_uri + " " + parent_job.arguments,
                    parent_job.cmdvals,
                    parent_name,
                )
                dependency_map.get(pseudo_file_name).append(pseudo_file_parent)
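
            # A job with no successors marks the end of the DAG; remember its
            # label so the corresponding task can be flagged with is_dag_end.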
            successors = self.bps_workflow.successors(job_name)
            if next(successors, None) is None:
                self.dag_end_tasks.add(gwjob.label)
        return dependency_map

    def split_map_over_tasks(self, raw_dependency_map):
        """Group nodes performing the same operations into tasks. For each
        task, define its inputs and their dependencies.

        This is the structure to be filled out for each task: ::

            dependencies = [
                {
                    "name": "filename0",
                    "dependencies": [
                        {
                            "task": "task1",
                            "inputname": "filename0",
                            "available": False
                        },
                    ],
                    "submitted": False
                }
            ]

        Parameters
        ----------
        raw_dependency_map : `dict`
            Pairs of node - list of directly connected upstream nodes.

        Returns
        -------
        tasks_dependency_map : `dict` [`str`, `list`]
            Dict of tasks and the corresponding dependencies.
        """
        tasks_dependency_map = {}
        for job, dependency in raw_dependency_map.items():
            task_name = self.define_task_name(self.jobs_steps[job])
            tasks_dependency_map.setdefault(task_name, {})[job] = self.split_dependencies_by_tasks(dependency)
            self.tasks_steps[task_name] = self.jobs_steps[job]
        return tasks_dependency_map

    def get_task_by_job_name(self, job_name):
        """Return the task part of a job name (its second underscore-separated
        token), or the job name itself if it contains no underscore.
        """
        return job_name.split("_")[1] if len(job_name.split("_")) > 1 else job_name

    def split_dependencies_by_tasks(self, dependencies):
        """Group the list of dependencies by the tasks the dependencies
        come from.

        Parameters
        ----------
        dependencies : `list` [`dict`]
            Each dictionary in the list contains information about a
            dependency: task, inputname, available.

        Returns
        -------
        dependencies_by_tasks : `dict` [`str`, `list`]
            Dict of tasks and the dependency files coming from each task.
        """
        dependencies_by_tasks = {}
        for dependency in dependencies:
            dependencies_by_tasks.setdefault(self.define_task_name(self.jobs_steps[dependency]), []).append(
                dependency
            )
        return dependencies_by_tasks

    def get_input_file(self, job_name):
        """Extract the quantum graph file needed for a job.

        Parameters
        ----------
        job_name : `str`
            The name of the node in the workflow DAG.

        Returns
        -------
        quantum graph file name
        """
        return next(iter(self.bps_workflow.nodes.get(job_name).get("inputs")))
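

# Example usage (sketch only; assumes `config` is the BpsConfig and
# `bps_workflow` the GenericWorkflow already built by the BPS submit
# machinery for a run):
#
#     generator = IDDSWorkflowGenerator(bps_workflow, config)
#     tasks = generator.define_tasks()
#     for task in tasks:
#         print(task.name, task.queue, task.max_rss, task.is_final)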