Coverage for python/lsst/ctrl/bps/panda/idds_tasks.py: 26%
239 statements
# This file is part of ctrl_bps_panda.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import logging
import os.path
from dataclasses import dataclass

from lsst.ctrl.bps import GenericWorkflow, GenericWorkflowJob
from lsst.ctrl.bps.panda.cmd_line_embedder import CommandLineEmbedder

_LOG = logging.getLogger(__name__)


@dataclass
class FileDescriptor:
    """Holds parameters needed to define a file used by a job or task."""

    name: str = None
    """Name of the file"""
    distribution_url: str = None
    """The location (URL) where this file is to be distributed to the edge node"""
    submission_url: str = None
    """Path to the file on the submission node"""
    direct_IO: bool = False
    """Whether the file is to be used remotely (direct I/O)"""
    delivered: bool = False
    """Whether this file has been delivered to the distribution endpoint"""


@dataclass
class RubinTask:
    """Holds parameters needed to define a PanDA task."""

    name: str = None
    """Name of the task"""
    step: str = None
    """Processing step"""
    queue: str = None
    """Computing queue where the task is to be submitted"""
    executable: str = None
    """The task command line to be executed"""
    max_walltime: int = None
    """Maximum allowed walltime in seconds"""
    max_attempt: int = None
    """Maximum number of job attempts in a task"""
    max_rss: int = None
    """Maximum size of RAM to be used by a job"""
    cloud: str = None
    """Computing cloud in the CRIC registry where the task should
    be submitted to"""
    site: str = None
    """Computing site in the CRIC registry where the task should
    be submitted to"""
    core_count: int = 1
    """Number of CPU cores to be used by a job"""
    working_group: str = None
    """Group for accounting"""
    priority: int = 0
    """Task priority"""
    processing_type: str = None
    """Task processing type, such as simulation or reconstruction"""
    task_type: str = None
    """The type of the task, such as production or analysis"""
    prod_source_label: str = "managed"
    """Label to distinguish production jobs from test jobs. Its value
    can be 'managed' or 'test'"""
    vo: str = "Rubin"
    """Virtual organization name"""
    jobs_pseudo_inputs: list = None
    """Pseudo inputs used by the task, defining its jobs"""
    files_used_by_task: list = None
    """List of physical files necessary for running a task"""
    dependencies: list = None
    """List of upstream tasks and their pseudo input parameters
    needed for running jobs in this task"""
    is_final: bool = False
    """Is this a finalization task"""
    is_dag_end: bool = False
    """Is this task at the end of the DAG"""


class IDDSWorkflowGenerator:
    """Class that generates an iDDS workflow to be submitted to PanDA.

    The workflow includes the definition of each task and the definition
    of dependencies for each task input.

    Parameters
    ----------
    bps_workflow : `lsst.ctrl.bps.GenericWorkflow`
        The generic workflow constructed by the BPS system.
    config : `lsst.ctrl.bps.BpsConfig`
        BPS configuration that includes necessary submit/runtime
        information, sufficiently defined in the YAML file supplied
        to the `submit` command.
    """

    def __init__(self, bps_workflow, config):
        self.bps_workflow = bps_workflow
        self.bps_config = config
        self.jobs_steps = {}
        self.tasks_steps = {}
        self.tasks_cmd_lines = {}
        self.dag_end_tasks = set()
        self.number_of_retries = {}
        _, self.max_walltime = config.search("maxWalltime", opt={"default": 90000})
        _, self.max_jobs_per_task = config.search("maxJobsPerTask", opt={"default": 30000})

    def define_task_name(self, step):
        """Return task name as a combination of the workflow name (unique
        across workflows) and processing step name.

        Parameters
        ----------
        step : `str`
            Processing step name.

        Returns
        -------
        task_name : `str`
            Computed task name.
        """
        return self.bps_config["workflowName"] + "_" + step

    def fill_input_files(self, task_name):
        files = []
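        # Collect the jobs that belong to this task, i.e. those whose BPS
        # label matches the processing step recorded for the task.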
        jobs = [
            job_name
            for job_name in self.bps_workflow
            if self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name]
        ]
        for job in jobs:
            for gwfile in self.bps_workflow.get_job_inputs(job, transfer_only=True):
                file = FileDescriptor()
                file.name = gwfile.name
                file.submission_url = gwfile.src_uri
                file.distribution_url = os.path.join(
                    self.bps_config["fileDistributionEndPoint"], os.path.basename(gwfile.src_uri)
                )
                file.direct_IO = gwfile.job_access_remote
                files.append(file)
        return files

    def define_tasks(self):
        """Provide task definitions sufficient for PanDA submission.

        Returns
        -------
        tasks : `list` [`RubinTask`]
            Tasks filled with parameters provided in the workflow
            configuration and the generated pipeline.
        """
        tasks = []
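        # Build the job-level dependency map, group it by task, and then
        # split tasks that exceed the job-count limit into chunks.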
        raw_dependency_map = self.create_raw_jobs_dependency_map()
        tasks_dependency_map = self.split_map_over_tasks(raw_dependency_map)
        tasks_dependency_map_chunked = self.split_tasks_into_chunks(tasks_dependency_map)
        for task_name, jobs in tasks_dependency_map_chunked.items():
            task = RubinTask()
            task.step = task_name
            task.name = task.step
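            # Pick any one job carrying this task's label; the per-task
            # settings (queue, site, resources, etc.) are read from it.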
            picked_job_name = next(
                filter(
                    lambda job_name: self.bps_workflow.get_job(job_name).label == self.tasks_steps[task_name],
                    self.bps_workflow,
                )
            )
            bps_node = self.bps_workflow.get_job(picked_job_name)
            task.queue = bps_node.queue
            task.cloud = bps_node.compute_cloud
            task.site = bps_node.compute_site
            task.core_count = bps_node.request_cpus
            task.priority = bps_node.priority
            task.working_group = bps_node.accounting_group
            task.jobs_pseudo_inputs = list(jobs)
            if bps_node.number_of_retries:
                task.max_attempt = bps_node.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task_name, 3)
            if bps_node.request_walltime:
                task.max_walltime = bps_node.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = bps_node.request_memory
            task.executable = self.tasks_cmd_lines[task_name]
            task.files_used_by_task = self.fill_input_files(task_name)
            task.is_final = False
            task.is_dag_end = self.tasks_steps[task_name] in self.dag_end_tasks
            tasks.append(task)
        self.add_dependencies(tasks, tasks_dependency_map_chunked)
        final_task = self.get_final_task()
        tasks.append(final_task)
        return tasks

    def split_tasks_into_chunks(self, tasks_dependency_map):
        """Split a task into chunks if the number of jobs it would contain
        exceeds the configured threshold.

        Parameters
        ----------
        tasks_dependency_map : `dict`
            Dependencies dictionary with task names as keys and job
            dependencies as values. The latter dict has a job's input
            parameters (PanDA pseudo file name) as the key and a dict of
            (upstream task name) - (its PanDA pseudo file names) pairs
            defining the dependencies of that job.

        Returns
        -------
        tasks_dependency_map : `dict`
            Dependencies dictionary with tasks chunked where needed.
        """
        tasks_dependency_map_chunked = {}
        tasks_chunked = {}

        # At this step only task names are updated to distribute tasks
        # over chunks.
        for task_name, dependencies in tasks_dependency_map.items():
            n_jobs_in_task = len(dependencies)
            if n_jobs_in_task > self.max_jobs_per_task:
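                # -(-a // b) is ceiling division, so every job is assigned
                # to one of the chunks.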
                n_chunks = -(-n_jobs_in_task // self.max_jobs_per_task)
                for pseudo_input, dependency in dependencies.items():
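                    # Map the pseudo input onto one of the chunks by hashing
                    # its name (stable within a single interpreter run).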
                    chunk_id = hash(pseudo_input) % n_chunks
                    task_name_chunked = self.get_task_name_with_chunk(task_name, chunk_id)
                    tasks_dependency_map_chunked.setdefault(task_name_chunked, {})[pseudo_input] = dependency
                    self.tasks_steps[task_name_chunked] = self.tasks_steps[task_name]
                    self.tasks_cmd_lines[task_name_chunked] = self.tasks_cmd_lines[task_name]
                tasks_chunked[task_name] = n_chunks
            else:
                tasks_dependency_map_chunked[task_name] = dependencies
248 """This block propagates chunking over upstream dependencies
249 records.
250 """
251 tasks_dependency_map_chunked_updated_dep = {}
252 for task, dependencies in tasks_dependency_map_chunked.items():
253 for pseudo_input, dependency in dependencies.items():
254 updated_dependencies = {}
255 for upstream_task_name, pseudo_inputs in dependency.items():
256 if upstream_task_name in tasks_chunked:
257 for upstream_pseudo_input in pseudo_inputs:
258 chunk_id = hash(upstream_pseudo_input) % tasks_chunked[upstream_task_name]
259 task_name_chunked = self.get_task_name_with_chunk(upstream_task_name, chunk_id)
260 chunked_task_name = task_name_chunked
261 updated_dependencies.setdefault(chunked_task_name, []).append(
262 upstream_pseudo_input
263 )
264 else:
265 updated_dependencies.setdefault(upstream_task_name, []).extend(pseudo_inputs)
266 tasks_dependency_map_chunked_updated_dep.setdefault(task, {}).setdefault(
267 pseudo_input, {}
268 ).update(updated_dependencies)
269 return tasks_dependency_map_chunked_updated_dep

    def get_task_name_with_chunk(self, task_name, chunk_id):
        """Concatenate the task name and the chunk ID.

        Parameters
        ----------
        task_name : `str`
            The name of the task.
        chunk_id : `int`
            ID of the chunk.

        Returns
        -------
        task_name : `str`
            Concatenated task name.
        """
287 return f"{task_name}_chunk_{chunk_id}"

    def get_final_task(self):
        """If a final job exists in the generic workflow, create the DAG
        final task.

        Returns
        -------
        task : `RubinTask`
            The final task for a workflow.
        """
        final_job = self.bps_workflow.get_final()
        if final_job and isinstance(final_job, GenericWorkflowJob):
            task = RubinTask()
            bash_file = FileDescriptor()
            bash_file.submission_url = final_job.executable.src_uri
            bash_file.distribution_url = os.path.join(
                self.bps_config["fileDistributionEndPoint"], final_job.executable.name
            )
            task.executable = f"bash ./{final_job.executable.name} {final_job.arguments}"

            task.step = final_job.label
            task.name = self.define_task_name(final_job.label)
            task.queue = final_job.queue
            task.cloud = final_job.compute_cloud
            task.site = final_job.compute_site
            task.core_count = final_job.request_cpus
            task.priority = final_job.priority
            task.working_group = final_job.accounting_group
            task.jobs_pseudo_inputs = []

            # This string implements the empty pattern for dependencies.
            task.dependencies = [
                {"name": "pure_pseudoinput+qgraphNodeId:+qgraphId:", "submitted": False, "dependencies": []}
            ]

            if final_job.number_of_retries:
                task.max_attempt = final_job.number_of_retries
            else:
                task.max_attempt = self.number_of_retries.get(task.name, 3)
            if final_job.request_walltime:
                task.max_walltime = final_job.request_walltime
            else:
                task.max_walltime = self.max_walltime
            task.max_rss = final_job.request_memory
            task.files_used_by_task = [bash_file]
            task.is_final = True
            task.is_dag_end = False
            return task
        elif final_job and isinstance(final_job, GenericWorkflow):
            raise NotImplementedError("PanDA plugin does not support a workflow as the final job")
        elif final_job:
            raise TypeError(f"Invalid type for GenericWorkflow.get_final() results ({type(final_job)})")

    def add_dependencies(self, tasks, tasks_dependency_map):
        """Add the dependency list to a task definition. This list defines
        all inputs of a task and how those inputs depend on upstream
        processing steps.

        Parameters
        ----------
        tasks : `list` [`RubinTask`]
            Tasks to be filled with dependency information.
        tasks_dependency_map : `dict`
            Dictionary of dependencies, keyed by task.

        Returns
        -------
        None; the method modifies the items of the ``tasks`` list provided
        as an argument.
        """
        for task in tasks:
            jobs = tasks_dependency_map[task.step]
            task.dependencies = []
            for job, job_dependency in jobs.items():
                job_dep = {
                    "name": job,
                    "submitted": False,
                }
                input_files_dependencies = []
                for taskname, files in job_dependency.items():
                    for file in files:
                        input_files_dependencies.append(
                            {"task": taskname, "inputname": file, "available": False}
                        )
                job_dep["dependencies"] = input_files_dependencies
                task.dependencies.append(job_dep)

    def create_raw_jobs_dependency_map(self):
        """Compute the dependency map (node - list of nodes) for each node
        in the workflow DAG.

        Returns
        -------
        dependency_map : `dict`
            Pairs of node - dependencies; for each node in the workflow DAG
            its dependencies (other nodes) are computed.
        """
        dependency_map = {}
        cmd_line_embedder = CommandLineEmbedder(self.bps_config)
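
        # Map each job's pseudo input file name to the list of its parents'
        # pseudo input file names.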
        for job_name in self.bps_workflow:
            gwjob = self.bps_workflow.get_job(job_name)
            cmd_line, pseudo_file_name = cmd_line_embedder.substitute_command_line(
                gwjob.executable.src_uri + " " + gwjob.arguments, gwjob.cmdvals, job_name
            )
            if len(pseudo_file_name) > 4000:
                _LOG.error(f"pseudo_file_name: {pseudo_file_name}")
                raise NameError(
                    "Job pseudo input file name contains more than 4000 characters. Cannot proceed."
                )

            task_name_for_label = self.define_task_name(gwjob.label)
            self.tasks_cmd_lines[task_name_for_label] = cmd_line
            self.jobs_steps[pseudo_file_name] = gwjob.label
            if gwjob.number_of_retries:
                self.number_of_retries[task_name_for_label] = gwjob.number_of_retries
            dependency_map[pseudo_file_name] = []
            predecessors = self.bps_workflow.predecessors(job_name)
            for parent_name in predecessors:
                parent_job = self.bps_workflow.get_job(parent_name)
                cmd_line_parent, pseudo_file_parent = cmd_line_embedder.substitute_command_line(
                    parent_job.executable.src_uri + " " + parent_job.arguments,
                    parent_job.cmdvals,
                    parent_name,
                )
                dependency_map.get(pseudo_file_name).append(pseudo_file_parent)

            successors = self.bps_workflow.successors(job_name)
            if next(successors, None) is None:
                self.dag_end_tasks.add(gwjob.label)
        return dependency_map

    def split_map_over_tasks(self, raw_dependency_map):
        """Group nodes performing the same operations into tasks. For each
        task define its inputs and their dependencies.

        This is the structure to be filled out for each task name: ::

            dependencies = [
                {
                    "name": "filename0",
                    "dependencies": [
                        {
                            "task": "task1",
                            "inputname": "filename0",
                            "available": False
                        },
                    ],
                    "submitted": False
                }
            ]

        Parameters
        ----------
        raw_dependency_map : `dict`
            Pairs of node - list of directly connected upstream nodes.

        Returns
        -------
        tasks_dependency_map : `dict` [`str`, `list`]
            Dict of tasks and their corresponding dependencies.
        """
        tasks_dependency_map = {}
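        # Group job-level dependencies by the task (one task per BPS label)
        # that each job belongs to.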
        for job, dependency in raw_dependency_map.items():
            task_name = self.define_task_name(self.jobs_steps[job])
            tasks_dependency_map.setdefault(task_name, {})[job] = self.split_dependencies_by_tasks(dependency)
            self.tasks_steps[task_name] = self.jobs_steps[job]
        return tasks_dependency_map

    def get_task_by_job_name(self, job_name):
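        # If the job name contains an underscore, take the second token as
        # the task label; otherwise return the job name unchanged.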
        return job_name.split("_")[1] if len(job_name.split("_")) > 1 else job_name

    def split_dependencies_by_tasks(self, dependencies):
        """Group the list of dependencies by the tasks the dependencies
        come from.

        Parameters
        ----------
        dependencies : `list` [`dict`]
            Each dictionary in the list contains information about a
            dependency: task, inputname, available.

        Returns
        -------
        dependencies_by_tasks : `dict` [`str`, `list`]
            Dict of tasks and the dependency files that come from each task.
        """
        dependencies_by_tasks = {}
        for dependency in dependencies:
            dependencies_by_tasks.setdefault(self.define_task_name(self.jobs_steps[dependency]), []).append(
                dependency
            )
        return dependencies_by_tasks

    def get_input_file(self, job_name):
        """Extract the quantum graph file needed for a job.

        Parameters
        ----------
        job_name : `str`
            The name of the node in the workflow DAG.

        Returns
        -------
        quantum graph file name
        """
        return next(iter(self.bps_workflow.nodes.get(job_name).get("inputs")))