Coverage for python/lsst/ctrl/bps/wms_service.py: 86%
105 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-28 03:07 -0700
« prev ^ index » next coverage.py v7.4.4, created at 2024-03-28 03:07 -0700
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <https://www.gnu.org/licenses/>.
28"""Base classes for working with a specific WMS."""
# Public API of this module: the names exported by a star-import. Every
# entry is a class defined further down in this file.
__all__ = [
    "BaseWmsService",
    "BaseWmsWorkflow",
    "WmsJobReport",
    "WmsRunReport",
    "WmsStates",
]
40import dataclasses
41import logging
42from abc import ABCMeta
43from enum import Enum
45_LOG = logging.getLogger(__name__)
class WmsStates(Enum):
    """States that a run or an individual job can be in."""

    UNKNOWN = 0
    """The state could not be determined."""

    MISFIT = 1
    """A state was determined, but it matches none of the other states."""

    UNREADY = 2
    """Parents have not finished yet; still waiting on them."""

    READY = 3
    """Every parent has finished successfully."""

    PENDING = 4
    """Ready to run and visible in the batch queue."""

    RUNNING = 5
    """Executing right now."""

    DELETED = 6
    """Already deleted, or in the process of being deleted."""

    HELD = 7
    """Placed in a hold state."""

    SUCCEEDED = 8
    """Completed with a success status."""

    FAILED = 9
    """Completed with a non-success status."""

    PRUNED = 10
    """At least one parent failed or cannot be run."""
@dataclasses.dataclass(slots=True)
class WmsJobReport:
    """Per-job WMS details that are included in the detailed report output."""

    wms_id: str
    """Identifier the workflow management system assigned to the job."""

    name: str
    """Name that BPS assigned to the job automatically."""

    label: str
    """User-facing job label; several jobs may share the same label."""

    state: WmsStates
    """Current execution state of the job."""
@dataclasses.dataclass(slots=True)
class WmsRunReport:
    """WMS run information to be included in detailed report output.

    Every field defaults to `None`, so a report can be created empty and
    filled in piecemeal; the annotations therefore spell out ``| None``
    explicitly instead of relying on an implicit optional.
    """

    wms_id: str | None = None
    """Id assigned to the run by the WMS."""

    global_wms_id: str | None = None
    """Global run identification number.

    Only applicable in the context of a WMS using distributed job queues
    (e.g., HTCondor).
    """

    path: str | None = None
    """Path to the submit directory."""

    label: str | None = None
    """Run's label."""

    run: str | None = None
    """Run's name."""

    project: str | None = None
    """Name of the project the run belongs to."""

    campaign: str | None = None
    """Name of the campaign the run belongs to."""

    payload: str | None = None
    """Name of the payload."""

    operator: str | None = None
    """Username of the operator who submitted the run."""

    run_summary: str | None = None
    """Job counts per label."""

    state: WmsStates | None = None
    """Run's execution state."""

    jobs: list[WmsJobReport] | None = None
    """Information about individual jobs in the run."""

    total_number_jobs: int | None = None
    """Total number of jobs in the run."""

    job_state_counts: dict[WmsStates, int] | None = None
    """Job counts per state."""

    job_summary: dict[str, dict[WmsStates, int]] | None = None
    """Job counts per label and per state."""

    exit_code_summary: dict[str, list[int]] | None = None
    """Summary of non-zero exit codes per job label available through the WMS.

    Currently, the behavior for jobs that were canceled, held, etc. is plugin
    dependent.
    """
class BaseWmsService:
    """Interface for interactions with a specific WMS.

    Every method except the constructor raises `NotImplementedError`; a
    WMS-specific subclass is expected to override the ones it supports.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration needed by the WMS service.
    """

    def __init__(self, config):
        self.config = config

    def prepare(self, config, generic_workflow, out_prefix=None):
        """Create submission for a generic workflow for a specific WMS.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            BPS configuration.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic representation of a single workflow.
        out_prefix : `str`, optional
            Prefix for all WMS output files.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def submit(self, workflow):
        """Submit a single WMS workflow.

        Parameters
        ----------
        workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            Prepared WMS Workflow to submit for execution.
        """
        raise NotImplementedError

    def restart(self, wms_workflow_id):
        """Restart a workflow from the point of failure.

        Parameters
        ----------
        wms_workflow_id : `str`
            Id that can be used by WMS service to identify the workflow that
            needs to be restarted.

        Returns
        -------
        wms_id : `str`
            Id of the restarted workflow. If restart failed, it will be set
            to `None`.
        run_name : `str`
            Name of the restarted workflow. If restart failed, it will be set
            to `None`.
        message : `str`
            A message describing any issues encountered during the restart.
            If there were no issues, an empty string is returned.
        """
        raise NotImplementedError

    def list_submitted_jobs(
        self,
        wms_id=None,
        user=None,
        require_bps=True,
        pass_thru=None,
        is_global=False,
    ):
        """Query WMS for list of submitted WMS workflows/jobs.

        This should be a quick lookup function to create list of jobs for
        other functions.

        Parameters
        ----------
        wms_id : `int` or `str`, optional
            Id or path that can be used by WMS service to look up job.
        user : `str`, optional
            User whose submitted jobs should be listed.
        require_bps : `bool`, optional
            Whether to require jobs returned in list to be bps-submitted jobs.
        pass_thru : `str`, optional
            Information to pass through to WMS.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.

        Returns
        -------
        job_ids : `list` [`Any`]
            Only job ids to be used by cancel and other functions. Typically
            this means top-level jobs (i.e., not children jobs).
        """
        raise NotImplementedError

    def report(
        self,
        wms_workflow_id=None,
        user=None,
        hist=0,
        pass_thru=None,
        is_global=False,
        return_exit_codes=False,
    ):
        """Query WMS for status of submitted WMS workflows.

        Parameters
        ----------
        wms_workflow_id : `int` or `str`, optional
            Id that can be used by WMS service to look up status.
        user : `str`, optional
            Limit report to submissions by this particular user.
        hist : `int`, optional
            Number of days to expand report to include finished WMS workflows.
        pass_thru : `str`, optional
            Additional arguments to pass through to the specific WMS service.
        is_global : `bool`, optional
            If set, all available job queues will be queried for job
            information. Defaults to False which means that only a local job
            queue will be queried for information.

            Only applicable in the context of a WMS using distributed job
            queues (e.g., HTCondor). A WMS with a centralized job queue
            (e.g. PanDA) can safely ignore it.
        return_exit_codes : `bool`, optional
            If set, return exit codes related to jobs with a
            non-success status. Defaults to False, which means that only
            the summary state is returned.

            Only applicable in the context of a WMS with associated
            handlers to return exit codes from jobs.

        Returns
        -------
        run_reports : `list` [`lsst.ctrl.bps.WmsRunReport`]
            Status information for submitted WMS workflows.
        message : `str`
            Message to user on how to find more status information specific to
            this particular WMS.
        """
        raise NotImplementedError

    def cancel(self, wms_id, pass_thru=None):
        """Cancel submitted workflows/jobs.

        Parameters
        ----------
        wms_id : `str`
            ID or path of job that should be canceled.
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        deleted : `bool`
            Whether successful deletion or not. Currently, if any doubt or any
            individual jobs not deleted, return False.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError

    def run_submission_checks(self):
        """Check to run at start if running WMS specific submission steps.

        Any exception other than NotImplementedError will halt submission.
        Submit directory may not yet exist when this is called.
        """
        raise NotImplementedError

    # ``pass_thru`` defaults to None so the signature agrees with its
    # "optional" documentation and with cancel/report/list_submitted_jobs.
    def ping(self, pass_thru=None):
        """Check whether WMS services are up, reachable, and can authenticate
        if authentication is required.

        The services to be checked are those needed for submit, report, cancel,
        restart, but ping cannot guarantee whether jobs would actually run
        successfully.

        Parameters
        ----------
        pass_thru : `str`, optional
            Information to pass through to WMS.

        Returns
        -------
        status : `int`
            0 for success, non-zero for failure.
        message : `str`
            Any message from WMS (e.g., error details).
        """
        raise NotImplementedError
class BaseWmsWorkflow(metaclass=ABCMeta):
    """Base interface for one workflow expressed in WMS-specific form.

    Parameters
    ----------
    name : `str`
        Unique name of workflow.
    config : `lsst.ctrl.bps.BpsConfig`
        Generic workflow config.
    """

    def __init__(self, name, config):
        # Values supplied by the caller.
        self.name, self.config = name, config
        # Bookkeeping attributes all start out unset.
        self.service_class = self.run_id = self.submit_path = None

    @classmethod
    def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class):
        """Build the WMS-specific workflow out of a GenericWorkflow.

        The base implementation always raises `NotImplementedError`.

        Parameters
        ----------
        config : `lsst.ctrl.bps.BpsConfig`
            Configuration values needed for generating a WMS specific workflow.
        generic_workflow : `lsst.ctrl.bps.GenericWorkflow`
            Generic workflow from which to create the WMS-specific one.
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        service_class : `str`
            Full module name of WMS service class that created this workflow.

        Returns
        -------
        wms_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
            A WMS specific workflow.
        """
        raise NotImplementedError

    def write(self, out_prefix):
        """Write out the WMS files belonging to this workflow.

        The base implementation always raises `NotImplementedError`.

        Parameters
        ----------
        out_prefix : `str`
            Root directory to be used for WMS workflow inputs and outputs
            as well as internal WMS files.
        """
        raise NotImplementedError