Coverage for python/lsst/ctrl/bps/drivers.py: 12%
191 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-10-11 02:00 -0700
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Driver functions for each subcommand.

Driver functions ensure that all setup work is done before running
the subcommand method.
"""
# Public API of this module: one driver function per ``bps`` subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
    "ping_driver",
]
42import errno
43import getpass
44import logging
45import os
46import re
47import shutil
48from collections.abc import Iterable
49from pathlib import Path
51from lsst.pipe.base import Instrument
52from lsst.utils import doImport
53from lsst.utils.timer import time_this
54from lsst.utils.usage import get_peak_mem_usage
56from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
57from .bps_utils import _dump_env_info, _dump_pkg_info
58from .cancel import cancel
59from .ping import ping
60from .pre_transform import acquire_quantum_graph, cluster_quanta
61from .prepare import prepare
62from .report import report
63from .restart import restart
64from .submit import submit
65from .transform import transform
# Module-level logger shared by all driver functions.
_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Command-line values overriding settings in the configuration file.
        Empty or `None` values are ignored.  The special key
        ``runWmsSubmissionChecks`` (default: `False`) controls whether the
        WMS plugin's submission checks are run during initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required configuration setting is missing, a deprecated
        setting is present, or the WMS service class cannot be determined
        when submission checks were requested.
    OSError
        Raised if the submit directory cannot be created.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation are converted from
            # snake_case to camelCase.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was not defined neither at the command line nor
    # explicitly in config file, use the value provided by the environmental
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service_env = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service_env is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service_env

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise the same exception type with a friendlier message; the
        # original context is deliberately suppressed.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Keyword arguments forwarded to the submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Keyword arguments shared by both time_this() blocks below.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    def _log_peak_mem():
        # Building the formatted values is only worth doing when INFO is on.
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_args):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
        _log_peak_mem()

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_args):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
        _log_peak_mem()

    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Keyword arguments forwarded to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optionally persist the clustered graph and/or a DOT rendering of it.
    run_dir = config[".bps_defined.submitPath"]
    _, want_pickle = config.search("saveClusteredQgraph", opt={"default": False})
    if want_pickle:
        clustered_qgraph.save(os.path.join(run_dir, "bps_clustered_qgraph.pickle"))
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        clustered_qgraph.draw(os.path.join(run_dir, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Keyword arguments forwarded to the clustering stage.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    prefix = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        workflow, workflow_config = transform(config, clustered_qgraph, prefix)
        _LOG.info("Generic workflow name '%s'", workflow.name)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )
        _LOG.info(
            "GenericWorkflow contains %d job(s) (including final)", sum(workflow.job_counts.values())
        )

    # Optionally persist the generic workflow and/or a DOT rendering of it.
    _, want_workflow = config.search("saveGenericWorkflow", opt={"default": False})
    if want_workflow:
        with open(os.path.join(prefix, "bps_generic_workflow.pickle"), "wb") as outfh:
            workflow.save(outfh, "pickle")
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        with open(os.path.join(prefix, "bps_generic_workflow.dot"), "w") as outfh:
            workflow.draw(outfh, "dot")
    return workflow_config, workflow
def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Keyword arguments forwarded to the earlier stages.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Unless the caller decided otherwise, verify WMS requirements up front.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    config, blueprint = transform_driver(config_file, **kwargs)
    out_prefix = config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        workflow = prepare(config, blueprint, out_prefix)
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    return config, workflow
def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs : `~typing.Any`
        Keyword arguments forwarded to the preparation stages.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    # Keyword arguments shared by both time_this() blocks below.
    timing = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **timing):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timing):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info(
                "Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id
            )
            if _LOG.isEnabledFor(logging.INFO):
                _LOG.info(
                    "Peak memory usage for bps process %s (main), %s (largest child process)",
                    *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
                )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    if wms_service is None:
        # No class given: prefer the environment variable, then the
        # packaged default.
        wms_service = os.environ.get(
            "BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        )

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        print(f"Restart failed: {message}" if message else "Restart failed: Unknown error")
        return

    # When run_id is a path to the original submit directory, refresh the
    # runtime environment and package version snapshots there.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days of history (passed unchanged to ``report``).
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # No class given: prefer the environment variable, then the
        # packaged default.
        wms_service = os.environ.get(
            "BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        )
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # No class given: prefer the environment variable, then the
        # packaged default.
        wms_service = os.environ.get(
            "BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        )
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)
def ping_driver(wms_service=None, pass_thru=None):
    """Checks whether WMS services are up, reachable, and any authentication,
    if needed, succeeds.

    The services to be checked are those needed for submit, report, cancel,
    restart, but ping cannot guarantee whether jobs would actually run
    successfully.

    Parameters
    ----------
    wms_service : `str`, optional
        Name of the Workload Management System service class.
    pass_thru : `str`, optional
        Information to pass through to WMS.

    Returns
    -------
    success : `int`
        Whether services are up and usable (0) or not (non-zero).
    """
    if wms_service is None:
        # No class given: prefer the environment variable, then the
        # packaged default.
        wms_service = os.environ.get(
            "BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        )

    status, message = ping(wms_service, pass_thru)

    # Zero status means success; log at the level matching the outcome.
    emit = _LOG.info if not status else _LOG.error
    if message:
        emit(message)

    # Log overall status message
    if not status:
        _LOG.info("Ping successful.")
    else:
        _LOG.error("Ping failed (%d).", status)

    return status