Coverage for python/lsst/ctrl/bps/drivers.py: 12%
192 statements
« prev ^ index » next coverage.py v7.3.0, created at 2023-09-02 09:44 +0000
« prev ^ index » next coverage.py v7.3.0, created at 2023-09-02 09:44 +0000
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Driver functions for each subcommand.
24Driver functions ensure that ensure all setup work is done before running
25the subcommand method.
26"""
# Public API of this module: one driver function per bps subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]
42import errno
43import getpass
44import logging
45import os
46import re
47import shutil
48from collections.abc import Iterable
49from pathlib import Path
51from lsst.pipe.base import Instrument
52from lsst.utils import doImport
53from lsst.utils.timer import time_this
54from lsst.utils.usage import get_peak_mem_usage
56from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
57from .bps_utils import _dump_env_info, _dump_pkg_info
58from .cancel import cancel
59from .ping import ping
60from .pre_transform import acquire_quantum_graph, cluster_quanta
61from .prepare import prepare
62from .report import report
63from .restart import restart
64from .submit import submit
65from .transform import transform
# Module-level logger shared by all driver functions.
_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line overrides and driver options (e.g.,
        ``runWmsSubmissionChecks``); non-empty values are copied into the
        config under the ``.bps_cmdline`` section.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required config entry is missing (``outputRun``,
        ``submitPath``, or ``wmsServiceClass`` when submission checks are
        requested), or if the deprecated ``outCollection`` is present.
    OSError
        Raised if the submit directory cannot be created (including when it
        already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation are converted from
            # snake_case to camelCase (e.g., compute_site -> computeSite).
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was not defined neither at the command line nor
    # explicitly in config file, use the value provided by the environmental
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # A value equal to the package default is taken to mean the config
        # file did not set it explicitly, so the env variable may win.
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    # Derive a unique process name from the output run when not given.
    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same exception type with a clearer message;
        # "from None" drops the original traceback since it adds nothing.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Shared keyword arguments for the per-stage timing/memory reporting.
    stage_timer = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **stage_timer):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{peak.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for peak in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **stage_timer):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{peak.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for peak in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # When using QBB (and neither 'executionButlerTemplate' nor
    # 'executionButlerDir' is set) acquire_quantum_graph() will set
    # 'execution_butler_dir' to the submit directory. This will trick
    # 'ctrl_bps_parsl' to use a non-existent execution butler and the run will
    # fail. See ParslJob.get_command_line() for details.
    #
    # This simple trick should keep 'ctrl_bps_parsl' working for the time being
    # without making more complex changes in the logic which will be removed
    # soon anyway (see DM-40342).
    if os.path.normpath(execution_butler_dir) != os.path.normpath(submit_path):
        config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{peak.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for peak in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optionally persist clustering artifacts into the submit directory
    # for debugging purposes.
    run_dir = config[".bps_defined.submitPath"]
    if config.search("saveClusteredQgraph", opt={"default": False})[1]:
        clustered_qgraph.save(os.path.join(run_dir, "bps_clustered_qgraph.pickle"))
    if config.search("saveDot", opt={"default": False})[1]:
        clustered_qgraph.draw(os.path.join(run_dir, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    run_dir = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, run_dir)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{peak.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for peak in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    # Optionally persist the generic workflow for debugging purposes.
    if config.search("saveGenericWorkflow", opt={"default": False})[1]:
        with open(os.path.join(run_dir, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    if config.search("saveDot", opt={"default": False})[1]:
        with open(os.path.join(run_dir, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Run the WMS plugin submission checks unless the caller opted out.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    workflow_config, abstract_workflow = transform_driver(config_file, **kwargs)
    run_dir = workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(workflow_config, abstract_workflow, run_dir)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = [f"{peak.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for peak in get_peak_mem_usage()]
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    return workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    """
    # Run the WMS plugin submission checks unless the caller opted out.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    # Shared keyword arguments for the per-stage timing/memory reporting.
    stage_timer = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **stage_timer):
        workflow_config, workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **stage_timer):
            submit(workflow_config, workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", workflow.name, workflow.run_id)
        if _LOG.isEnabledFor(logging.INFO):
            peaks = [f"{peak.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for peak in get_peak_mem_usage()]
            _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    print(f"Run Id: {workflow.run_id}")
    print(f"Run Name: {workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        print(f"Restart failed: {message if message else 'Unknown error'}")
        return

    # When run_id is a submit path, refresh the environment/package dumps
    # there so they describe the restarted run.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days of run history to include in the report.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)
def ping_driver(wms_service=None, pass_thru=None):
    """Check whether WMS services are up, reachable, and any authentication,
    if needed, succeeds.

    The services to be checked are those needed for submit, report, cancel,
    restart, but ping cannot guarantee whether jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        # Fall back to the environment variable, then the package default.
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)

    status, message = ping(wms_service, pass_thru)

    # A zero status means success; pick the log level once and use it for
    # both the detail message and the overall summary.
    log_at = _LOG.info if not status else _LOG.error
    if message:
        log_at(message)

    # Log overall status message
    if status:
        log_at("Ping failed (%d).", status)
    else:
        log_at("Ping successful.")

    return status