Coverage for python/lsst/ctrl/bps/drivers.py: 11%
212 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-08 10:12 +0000
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-08 10:12 +0000
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Driver functions for each subcommand.
Driver functions ensure that all setup work is done before running
31the subcommand method.
32"""
# Public API of this module: one driver function per bps subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]
48import errno
49import getpass
50import logging
51import os
52import re
53import shutil
54from collections.abc import Iterable
55from pathlib import Path
57from lsst.pipe.base import Instrument
58from lsst.utils import doImport
59from lsst.utils.timer import time_this
60from lsst.utils.usage import get_peak_mem_usage
62from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
63from .bps_utils import _dump_env_info, _dump_pkg_info
64from .cancel import cancel
65from .ping import ping
66from .pre_transform import acquire_quantum_graph, cluster_quanta
67from .prepare import prepare
68from .report import report
69from .restart import restart
70from .submit import submit
71from .transform import transform
# Module-level logger; drivers report stage progress and memory usage here.
_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration (e.g., command-line
        overrides and the ``runWmsSubmissionChecks`` flag).

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required config entry is missing, if the deprecated
        ``outCollection`` entry is present, or if submission checks were
        requested but ``wmsServiceClass`` cannot be found.
    OSError
        Raised if the submit directory cannot be created (including when it
        already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # No explicit translation entry: convert snake_case kwarg name
            # to the camelCase convention used by bps yaml keys.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was defined neither at the command line nor
    # explicitly in the config file, use the value provided by the
    # environment variable BPS_WMS_SERVICE_CLASS. If the variable is not
    # set, stick to the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # Only override when the config file did not change the value away
        # from the package default.
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    # Derive a unique process name from the output run when not given.
    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        # exist_ok=False: deliberately refuse to reuse a previous run's
        # submit directory.
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same exception type with a clearer message; drop the
        # chained context on purpose.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Keyword arguments shared by both timed stages below.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    def _log_peak_mem():
        # Skip the formatting work entirely unless INFO messages are emitted.
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_args):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
        _log_peak_mem()

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_args):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
        _log_peak_mem()

    # When using QBB (and neither 'executionButlerTemplate' nor
    # 'executionButlerDir' is set) acquire_quantum_graph() will set
    # 'execution_butler_dir' to the submit directory. This will trick
    # 'ctrl_bps_parsl' to use a non-existent execution butler and the run will
    # fail. See ParslJob.get_command_line() for details.
    #
    # This simple trick should keep 'ctrl_bps_parsl' working for the time being
    # without making more complex changes in the logic which will be removed
    # soon anyway (see DM-40342).
    if os.path.normpath(execution_butler_dir) != os.path.normpath(submit_path):
        config[".bps_defined.executionButlerDir"] = execution_butler_dir

    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    # Acquire stage first: yields the updated config and the quantum graph.
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    with time_this(msg="Cluster stage completed", **timer_args):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optionally persist debugging artifacts requested via the config.
    submit_path = config[".bps_defined.submitPath"]
    if config.search("saveClusteredQgraph", opt={"default": False})[1]:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    if config.search("saveDot", opt={"default": False})[1]:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Run all earlier stages (acquire + cluster) first.
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    with time_this(msg="Transform stage completed", **timer_args):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    # Optionally persist debugging artifacts requested via the config.
    if config.search("saveGenericWorkflow", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as stream:
            generic_workflow.save(stream, "pickle")
    if config.search("saveDot", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as stream:
            generic_workflow.draw(stream, "dot")
    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Enable early WMS plugin checks by default when preparing.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    with time_this(msg="Prepare stage completed", **timer_args):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    # The generic workflow config doubles as the WMS workflow config.
    return generic_workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Additional modifiers to the configuration.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    # Peek at the config to decide whether this is a remote (PanDA) build.
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)
    _, remote_build = config.search("remoteBuild", opt={"default": {}})
    if remote_build:
        if config["wmsServiceClass"] == "lsst.ctrl.bps.panda.PanDAService":
            if not remote_build.search("enabled", opt={"default": False})[1]:
                remote_build = {}
                _LOG.info("The workflow is submitted to the local Data Facility.")
            else:
                _LOG.info("Remote submission is enabled. The workflow is submitted to a remote Data Facility.")
                _LOG.info("Initializing execution environment")
                with time_this(
                    log=_LOG,
                    level=logging.INFO,
                    prefix=None,
                    msg="Initializing execution environment completed",
                    mem_usage=True,
                    mem_unit=DEFAULT_MEM_UNIT,
                    mem_fmt=DEFAULT_MEM_FMT,
                ):
                    config = _init_submission_driver(config_file, **kwargs)
                    # Pass the remote build request and the original config
                    # file through to the WMS plugin via submit().
                    kwargs["remote_build"] = remote_build
                    kwargs["config_file"] = config_file
                    wms_workflow = None
        else:
            # Remote build is only supported by the PanDA plugin; for any
            # other WMS fall back to a regular local submission. (Previously
            # this case fell through with remote_build still set, leaving
            # wms_workflow undefined at the submit() call below.)
            remote_build = {}
            _LOG.info("The workflow is submitted to the local Data Facility.")
    else:
        _LOG.info("The workflow is submitted to the local Data Facility.")

    _LOG.info("Starting submission process")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Completed entire submission process",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        if not remote_build:
            # Local build: run acquire/cluster/transform/prepare here.
            wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)
        else:
            # Remote build: the plugin does the heavy lifting; only the
            # initialized configuration is needed.
            wms_workflow_config = config

        _LOG.info("Starting submit stage")
        with time_this(
            log=_LOG,
            level=logging.INFO,
            prefix=None,
            msg="Completed submit stage",
            mem_usage=True,
            mem_unit=DEFAULT_MEM_UNIT,
            mem_fmt=DEFAULT_MEM_FMT,
        ):
            workflow = submit(wms_workflow_config, wms_workflow, **kwargs)
            if not wms_workflow:
                # Remote builds get their workflow object back from submit().
                wms_workflow = workflow
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
            if _LOG.isEnabledFor(logging.INFO):
                _LOG.info(
                    "Peak memory usage for bps process %s (main), %s (largest child process)",
                    *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
                )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    # Fall back to the environment variable and then the package default
    # when no service class was given.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        # Restart did not happen; surface whatever explanation we have.
        print(f"Restart failed: {message or 'Unknown error'}")
        return

    # Refresh environment/package provenance files when run_id points at an
    # existing submit directory.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False, return_exit_codes=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    return_exit_codes : `bool`, optional
        If set, return exit codes related to jobs with a
        non-success status. Defaults to False, which means that only
        the summary state is returned.

        Only applicable in the context of a WMS with associated
        handlers to return exit codes from jobs.
    """
    # Fall back to the environment variable and then the package default
    # when no service class was given.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global, return_exit_codes=return_exit_codes)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    # Fall back to the environment variable and then the package default
    # when no service class was given.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)
def ping_driver(wms_service=None, pass_thru=None):
    """Check whether WMS services are up, reachable, and any authentication,
    if needed, succeeds.

    The services to be checked are those needed for submit, report, cancel,
    restart, but ping cannot guarantee whether jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    # Fall back to the environment variable and then the package default
    # when no service class was given.
    if wms_service is None:
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)

    status, message = ping(wms_service, pass_thru)

    # Zero status means success; report any accompanying message at the
    # matching severity.
    if message:
        _LOG.log(logging.INFO if not status else logging.ERROR, message)

    # Log overall status message
    if status:
        _LOG.error("Ping failed (%d).", status)
    else:
        _LOG.info("Ping successful.")

    return status