Coverage for python/lsst/ctrl/bps/panda/idds_tasks.py: 26%
236 statements
coverage.py v7.2.1, created at 2023-03-12 21:11 -0700
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os.path
from dataclasses import dataclass

from lsst.ctrl.bps import GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder


@dataclass
class FileDescriptor:
    """Holds parameters needed to define a file used by a job or task."""

    name: str = None
    """Name of the file"""
    distribution_url: str = None
    """The location (URL) where this file is to be distributed to the
    edge node"""
    submission_url: str = None
    """Path to the file on the submission node"""
    direct_IO: bool = False
    """Whether the file is to be used remotely (direct I/O)"""
    delivered: bool = False
    """Whether this file has been delivered to the distribution endpoint"""


@dataclass
class RubinTask:
    """Holds parameters needed to define a PanDA task."""

    name: str = None
    """Name of the task"""
    step: str = None
    """Processing step"""
    queue: str = None
    """Computing queue where the task is to be submitted"""
    executable: str = None
    """The task command line to be executed"""
    max_walltime: int = None
    """Maximum allowed walltime in seconds"""
    max_attempt: int = None
    """Maximum number of job attempts in a task"""
    max_rss: int = None
    """Maximum size of RAM to be used by a job"""
    cloud: str = None
    """Computing cloud in the CRIC registry where the task should
    be submitted to"""
    site: str = None
    """Computing site in the CRIC registry where the task should
    be submitted to"""
    core_count: int = 1
    """Number of CPU cores to be used by a job"""
    working_group: str = None
    """Group for accounting"""
    priority: int = 0
    """Task priority"""
    processing_type: str = None
    """Task processing type, such as simulation or reconstruction"""
    task_type: str = None
    """The type of the task, such as production or analysis"""
    prod_source_label: str = "managed"
    """Label to manage production jobs and test jobs. Its value
    can be 'managed' or 'test'"""
    vo: str = "Rubin"
    """Virtual organization name"""
    jobs_pseudo_inputs: list = None
    """Pseudo inputs used by the task to define its jobs"""
    files_used_by_task: list = None
    """List of physical files necessary for running a task"""
    dependencies: list = None
    """List of upstream tasks and their pseudo input parameters
    needed for running jobs in this task"""
    is_final: bool = False
    """Is this a finalization task"""
    is_dag_end: bool = False
    """Is this task at the end of the DAG"""


class IDDSWorkflowGenerator:
    """
    Class that generates an iDDS workflow to be submitted into PanDA.
    The workflow includes the definition of each task and the
    definition of the dependencies for each task's input.

    Parameters
    ----------
    bps_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow constructed by the BPS system.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes the necessary submit/runtime
        information, sufficiently defined in the YAML file supplied to the
        `submit` command.
    """

    def __init__(self, bps_workflow, config):
        self.bps_workflow = bps_workflow
        self.bps_config = config
        self.jobs_steps = {}
        self.tasks_steps = {}
        self.tasks_cmd_lines = {}
        self.dag_end_tasks = set()
        self.number_of_retries = {}
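        # BpsConfig.search returns a (found, value) pair; the value falls
        # back to the default given in ``opt`` when the key is absent from
        # the submit configuration.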
        _, self.max_walltime = config.search("maxWalltime", opt={"default": 90000})
        _, self.max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": 30000})

    def define_task_name(self, step):
        """Return task name as a combination of the workflow name (unique
        across workflows) and processing step name.

        Parameters
        ----------
        step : `str`
            Processing step name.

        Returns
        -------
        task_name : `str`
            Computed task name.
        """
        return self.bps_config["workflowName"] + "_" + step
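    # Illustrative example (hypothetical values): with workflowName set to
    # "u_someuser_mypipeline_20230312T211100Z" and step "label1", the task
    # name would be "u_someuser_mypipeline_20230312T211100Z_label1".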

    def fill_input_files(self, task_name):
        """Collect file descriptors for the transferable input files of all
        jobs belonging to a task.
        """
        files = []
        jobs = [
            job_name
            for job_name in self.bps_workflow
            if self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name]
        ]
        for job in jobs:
            for gwfile in self.bps_workflow.get_job_inputs(job, transfer_only=True):
                file = FileDescriptor()
                file.name = gwfile.name
                file.submission_url = gwfile.src_uri
                file.distribution_url = os.path.join(
                    self.bps_config["fileDistributionEndPoint"], os.path.basename(gwfile.src_uri)
                )
                file.direct_IO = gwfile.job_access_remote
                files.append(file)
        return files

    def define_tasks(self):
        """Provide task definitions sufficient for PanDA submission.

        Returns
        -------
        tasks : `list` [`RubinTask`]
            Tasks filled with parameters provided in workflow configuration
            and generated pipeline.
        """
        tasks = []
        raw_dependency_map = self.create_raw_jobs_dependency_map()
        tasks_dependency_map = self.split_map_over_tasks(raw_dependency_map)
        tasks_dependency_map_chunked = self.split_tasks_into_chunks(tasks_dependency_map)
        for task_name, jobs in tasks_dependency_map_chunked.items():
            task = RubinTask()
            task.step = task_name
            task.name = task.step
            picked_job_name = next(
                filter(
                    lambda job_name: self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name],
                    self.bps_workflow,
                )
            )
            bps_node = self.bps_workflow.get_job(picked_job_name)
            task.queue = bps_node.queue
            task.cloud = bps_node.compute_cloud
            task.site = bps_node.compute_site
            task.core_count = bps_node.request_cpus
            task.priority = bps_node.priority
            task.working_group = bps_node.accounting_group
            task.jobs_pseudo_inputs = list(jobs)
            if bps_node.number_of_retries:
                task.max_attempt = bps_node.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task_name, 5)
            if bps_node.request_walltime:
                task.max_walltime = bps_node.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = bps_node.request_memory
            task.executable = self.tasks_cmd_lines[task_name]
            task.files_used_by_task = self.fill_input_files(task_name)
            task.is_final = False
            task.is_dag_end = self.tasks_steps[task_name] in self.dag_end_tasks
            tasks.append(task)
        self.add_dependencies(tasks, tasks_dependency_map_chunked)
        final_task = self.get_final_task()
        tasks.append(final_task)
        return tasks

    def split_tasks_into_chunks(self, tasks_dependency_map):
        """Split a task into chunks if it is going to contain more jobs
        than the configured threshold.

        Parameters
        ----------
        tasks_dependency_map : `dict`
            Dependencies dictionary with task names as keys and job
            dependencies as values. The latter dict has a job's input
            parameters (PanDA pseudo file name) as keys and, as values, a
            dict of (upstream task name) - (its PanDA pseudo file names)
            pairs which defines the dependencies of a job.

        Returns
        -------
        tasks_dependency_map : `dict`
            Dependencies dictionary with tasks chunked where needed.
        """
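        # Illustrative shape of the incoming map (names are hypothetical):
        # {
        #     "workflow_step2": {
        #         "pseudo_file_b": {"workflow_step1": ["pseudo_file_a"]},
        #     },
        # }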
        tasks_dependency_map_chunked = {}
        tasks_chunked = {}

        """At this step only task names are updated to distribute
        tasks over chunks
        """
        for task_name, dependencies in tasks_dependency_map.items():
            n_jobs_in_task = len(dependencies)
            if n_jobs_in_task > self.max_jobs_per_task:
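                # -(-a // b) is ceiling division: the smallest number of
                # chunks such that no chunk holds more than
                # max_jobs_per_task jobs. Jobs are then spread over the
                # chunks by hashing their pseudo input names.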
                n_chunks = -(-n_jobs_in_task // self.max_jobs_per_task)
                for pseudo_input, dependency in dependencies.items():
                    chunk_id = hash(pseudo_input) % n_chunks
                    task_name_chunked = self.get_task_name_with_chunk(task_name, chunk_id)
                    tasks_dependency_map_chunked.setdefault(task_name_chunked, {})[pseudo_input] = dependency
                    self.tasks_steps[task_name_chunked] = self.tasks_steps[task_name]
                    self.tasks_cmd_lines[task_name_chunked] = self.tasks_cmd_lines[task_name]
                tasks_chunked[task_name] = n_chunks
            else:
                tasks_dependency_map_chunked[task_name] = dependencies
        """This block propagates chunking over upstream dependency
        records.
        """
        tasks_dependency_map_chunked_updated_dep = {}
        for task, dependencies in tasks_dependency_map_chunked.items():
            for pseudo_input, dependency in dependencies.items():
                updated_dependencies = {}
                for upstream_task_name, pseudo_inputs in dependency.items():
                    if upstream_task_name in tasks_chunked:
                        for upstream_pseudo_input in pseudo_inputs:
                            chunk_id = hash(upstream_pseudo_input) % tasks_chunked[upstream_task_name]
                            task_name_chunked = self.get_task_name_with_chunk(upstream_task_name, chunk_id)
                            updated_dependencies.setdefault(task_name_chunked, []).append(
                                upstream_pseudo_input
                            )
                    else:
                        updated_dependencies.setdefault(upstream_task_name, []).extend(pseudo_inputs)
                tasks_dependency_map_chunked_updated_dep.setdefault(task, {}).setdefault(
                    pseudo_input, {}
                ).update(updated_dependencies)
        return tasks_dependency_map_chunked_updated_dep

    def get_task_name_with_chunk(self, task_name, chunk_id):
        """Concatenate a task name and a chunk ID.

        Parameters
        ----------
        task_name : `str`
            The name of the task.
        chunk_id : `int`
            ID of the chunk.

        Returns
        -------
        task_name : `str`
            Concatenated task name.
        """
        return f"{task_name}_chunk_{chunk_id}"

    def get_final_task(self):
        """If a final job exists in the generic workflow, create the DAG
        final task.

        Returns
        -------
        task : `RubinTask`
            The final task for a workflow.
        """
        final_job = self.bps_workflow.get_final()
        if final_job and isinstance(final_job, GenericWorkflowJob):
            task = RubinTask()
            bash_file = FileDescriptor()
            bash_file.submission_url = final_job.executable.src_uri
            bash_file.distribution_url = os.path.join(
                self.bps_config["fileDistributionEndPoint"], final_job.executable.name
            )
            task.executable = f"bash ./{final_job.executable.name} {final_job.arguments}"

            task.step = final_job.label
            task.name = self.define_task_name(final_job.label)
            task.queue = final_job.queue
            task.cloud = final_job.compute_cloud
            task.site = final_job.compute_site
            task.core_count = final_job.request_cpus
            task.priority = final_job.priority
            task.working_group = final_job.accounting_group
            task.jobs_pseudo_inputs = []

            # This string implements an empty pattern for dependencies
            task.dependencies = [
                {"name": "pure_pseudoinput+qgraphNodeId:+qgraphId:", "submitted": False, "dependencies": []}
            ]

            if final_job.number_of_retries:
                task.max_attempt = final_job.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task.name, 5)
            if final_job.request_walltime:
                task.max_walltime = final_job.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = final_job.request_memory
            task.files_used_by_task = [bash_file]
            task.is_final = True
            task.is_dag_end = False
            return task
        elif final_job and isinstance(final_job, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")
        elif final_job:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final_job)})")

    def add_dependencies(self, tasks, tasks_dependency_map):
        """Add the dependency list to a task definition. This list defines
        all inputs of a task and how those inputs depend on upstream
        processing steps.

        Parameters
        ----------
        tasks : `list` [`RubinTask`]
            Tasks to be filled with dependency information.
        tasks_dependency_map : `dict`
            Dependencies dictionary.

        Returns
        -------
        None. The method modifies the items of the ``tasks`` list in place.
        """
        for task in tasks:
            jobs = tasks_dependency_map[task.step]
            task.dependencies = []
            for job, job_dependency in jobs.items():
                job_dep = {
                    "name": job,
                    "submitted": False,
                }
                input_files_dependencies = []
                for taskname, files in job_dependency.items():
                    for file in files:
                        input_files_dependencies.append(
                            {"task": taskname, "inputname": file, "available": False}
                        )
                job_dep["dependencies"] = input_files_dependencies
                task.dependencies.append(job_dep)

    def create_raw_jobs_dependency_map(self):
        """Compute the dependency map (node - list of nodes) for each node
        in the workflow DAG.

        Returns
        -------
        dependency_map : `dict` of node-dependencies pairs
            For each node in the workflow DAG, its computed dependencies
            (other nodes).
        """
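        # Resulting shape (pseudo file names are hypothetical): each job's
        # pseudo input maps to the pseudo inputs of its direct parents, e.g.
        # {"pseudo_file_child": ["pseudo_file_parent1", "pseudo_file_parent2"]}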
        dependency_map = {}
        cmd_line_embedder = CommandLineEmbedder(self.bps_config)

        for job_name in self.bps_workflow:
            gwjob = self.bps_workflow.get_job(job_name)
            cmd_line, pseudo_file_name = cmd_line_embedder.substitute_command_line(
                gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, job_name
            )
            if len(pseudo_file_name) > 4000:
                raise NameError(
                    "job pseudo input file name contains more than 4000 characters. Cannot proceed."
                )

            task_name_for_label = self.define_task_name(gwjob.label)
            self.tasks_cmd_lines[task_name_for_label] = cmd_line
            self.jobs_steps[pseudo_file_name] = gwjob.label
            if gwjob.number_of_retries:
                self.number_of_retries[task_name_for_label] = gwjob.number_of_retries
            dependency_map[pseudo_file_name] = []
            predecessors = self.bps_workflow.predecessors(job_name)
            for parent_name in predecessors:
                parent_job = self.bps_workflow.get_job(parent_name)
                cmd_line_parent, pseudo_file_parent = cmd_line_embedder.substitute_command_line(
                    parent_job.executable.src_uri + " " + parent_job.arguments,
                    parent_job.cmdvals,
                    parent_name,
                )
                dependency_map.get(pseudo_file_name).append(pseudo_file_parent)

            successors = self.bps_workflow.successors(job_name)
            if next(successors, None) is None:
                self.dag_end_tasks.add(gwjob.label)
        return dependency_map

    def split_map_over_tasks(self, raw_dependency_map):
        """Group nodes performing the same operations into tasks. For each
        task, define its inputs and their dependencies.

        This is the structure to be filled out for each task: ::

            dependencies = [
                {
                    "name": "filename0",
                    "dependencies": [
                        {
                            "task": "task1",
                            "inputname": "filename0",
                            "available": False
                        },
                    ],
                    "submitted": False
                }
            ]

        Parameters
        ----------
        raw_dependency_map : `dict`
            Pairs of a node and the list of its directly connected upstream
            nodes.

        Returns
        -------
        tasks_dependency_map : `dict` [`str`, `list`]
            Dict of tasks and their correspondent dependencies.
        """
        tasks_dependency_map = {}
        for job, dependency in raw_dependency_map.items():
            task_name = self.define_task_name(self.jobs_steps[job])
            tasks_dependency_map.setdefault(task_name, {})[job] = self.split_dependencies_by_tasks(dependency)
            self.tasks_steps[task_name] = self.jobs_steps[job]
        return tasks_dependency_map

    def get_task_by_job_name(self, job_name):
        """Return the second underscore-separated token of a job name, or
        the job name itself if it contains no underscore.
        """
        return job_name.split("_")[1] if len(job_name.split("_")) > 1 else job_name

    def split_dependencies_by_tasks(self, dependencies):
        """Group the list of dependencies by the tasks the dependencies
        come from.

        Parameters
        ----------
        dependencies : `list` [`str`]
            Pseudo input file names of the upstream jobs this job depends
            on.

        Returns
        -------
        dependencies_by_tasks : `dict` [`str`, `list`]
            Dict of tasks and the dependency files that come from each task.
        """
        dependencies_by_tasks = {}
        for dependency in dependencies:
            dependencies_by_tasks.setdefault(self.define_task_name(self.jobs_steps[dependency]), []).append(
                dependency
            )
        return dependencies_by_tasks

    def get_input_file(self, job_name):
        """Extract the quantum graph file needed for a job.

        Parameters
        ----------
        job_name : `str`
            The name of the node in the workflow DAG.

        Returns
        -------
        file_name : `str`
            Quantum graph file name.
        """
        return next(iter(self.bps_workflow.nodes.get(job_name).get("inputs")))