Coverage for python/lsst/ctrl/bps/drivers.py: 12%
191 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-04-22 10:19 +0000
« prev ^ index » next coverage.py v7.2.3, created at 2023-04-22 10:19 +0000
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Driver functions for each subcommand.
Driver functions ensure that all setup work is done before running
the subcommand method.
26"""
29__all__ = [
30 "acquire_qgraph_driver",
31 "cluster_qgraph_driver",
32 "transform_driver",
33 "prepare_driver",
34 "submit_driver",
35 "report_driver",
36 "restart_driver",
37 "cancel_driver",
38 "ping_driver",
39]
42import errno
43import getpass
44import logging
45import os
46import re
47import shutil
48from collections.abc import Iterable
49from pathlib import Path
51from lsst.pipe.base import Instrument
52from lsst.utils import doImport
53from lsst.utils.timer import time_this
54from lsst.utils.usage import get_peak_mem_usage
56from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
57from .bps_utils import _dump_env_info, _dump_pkg_info
58from .cancel import cancel
59from .ping import ping
60from .pre_transform import acquire_quantum_graph, cluster_quanta
61from .prepare import prepare
62from .report import report
63from .restart import restart
64from .submit import submit
65from .transform import transform
67_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values overriding the configuration file, plus
        driver options such as ``runWmsSubmissionChecks``.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required setting (``outputRun``, ``submitPath``, or
        ``wmsServiceClass`` when submission checks are requested) is
        missing, or if the deprecated ``outCollection`` is present.
    OSError
        Raised if the submit directory cannot be created (including when
        it already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
        "compute_site": "computeSite",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation fall back to a generic
            # snake_case -> camelCase conversion.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was not defined neither at the command line nor
    # explicitly in config file, use the value provided by the environmental
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same exception type with a friendlier message,
        # suppressing the original traceback context.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """

    def _report_peak_memory():
        # Skip the formatting work entirely when INFO logging is disabled.
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    # Shared keyword arguments for the stage timers below.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_args):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
        _report_peak_memory()

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_args):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
        _report_peak_memory()

    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    with time_this(msg="Cluster stage completed", **timer_args):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    submit_path = config[".bps_defined.submitPath"]
    # Optionally persist the clustered graph and/or its dot rendering.
    _, want_pickle = config.search("saveClusteredQgraph", opt={"default": False})
    if want_pickle:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    with time_this(msg="Transform stage completed", **timer_args):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    # Optionally persist the generic workflow and/or its dot rendering.
    _, want_pickle = config.search("saveGenericWorkflow", opt={"default": False})
    if want_pickle:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Prepare implies running WMS submission checks unless caller says otherwise.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    with time_this(msg="Prepare stage completed", **timer_args):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    wms_workflow_config = generic_workflow_config
    return wms_workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    """
    # Submission implies running WMS checks unless caller says otherwise.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **timer_args):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timer_args):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
            if _LOG.isEnabledFor(logging.INFO):
                _LOG.info(
                    "Peak memory usage for bps process %s (main), %s (largest child process)",
                    *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
                )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    if wms_service is None:
        # Fall back to env var, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        print(f"Restart failed: {message if message else 'Unknown error'}")
        return

    if Path(run_id).exists():
        # Refresh environment/package snapshots in the existing run directory.
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to env var, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Fall back to env var, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)
def ping_driver(wms_service=None, pass_thru=None):
    """Check whether WMS services are up, reachable, and any authentication,
    if needed, succeeds.

    The services to be checked are those needed for submit, report, cancel,
    restart, but ping cannot guarantee whether jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        # Fall back to env var, then the package default.
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"])

    status, message = ping(wms_service, pass_thru)

    # A zero status means success; pick the matching log level once.
    emit = _LOG.error if status else _LOG.info
    if message:
        emit(message)

    # Log overall status message
    if status:
        emit("Ping failed (%d).", status)
    else:
        emit("Ping successful.")

    return status