Coverage for python/lsst/ctrl/bps/drivers.py: 13%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Driver functions for each subcommand.
24Driver functions ensure that all setup work is done before running
25the subcommand method.
26"""
29__all__ = [
30 "acquire_qgraph_driver",
31 "cluster_qgraph_driver",
32 "transform_driver",
33 "prepare_driver",
34 "submit_driver",
35 "report_driver",
36 "restart_driver",
37 "cancel_driver",
38]
41import errno
42import getpass
43import logging
44import os
45import re
46import shutil
47from collections.abc import Iterable
48from pathlib import Path
50from lsst.obs.base import Instrument
51from lsst.utils import doImport
52from lsst.utils.timer import time_this
53from lsst.utils.usage import get_peak_mem_usage
55from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, BpsConfig, DEFAULT_MEM_UNIT, DEFAULT_MEM_FMT
56from .pre_transform import acquire_quantum_graph, cluster_quanta
57from .transform import transform
58from .prepare import prepare
59from .submit import submit
60from .cancel import cancel
61from .report import report
62from .restart import restart
63from .bps_utils import _dump_env_info, _dump_pkg_info
66_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Set up the runtime environment for a submission.

    Builds the configuration from ``config_file``, overlays the command-line
    values, validates the required settings, optionally runs the submission
    checks of the WMS service class, creates the submit directory, and saves
    copies of the configuration plus environment/package information in it.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values.  Falsy values (e.g. `None`, empty string) are
        ignored; all others are stored in the ``.bps_cmdline`` section of
        the configuration.  If ``runWmsSubmissionChecks`` is truthy, the
        submission checks of the WMS service class are run.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if ``outCollection`` is present in the configuration, if
        ``outputRun`` or ``submitPath`` is missing, or if submission checks
        were requested but ``wmsServiceClass`` cannot be found.
    OSError
        Raised if the submit directory cannot be created; this includes the
        case where the directory already exists.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Command-line values take precedence over the config file.  These
    # pipetask argument names differ from their bps yaml counterparts by
    # more than a snake_case -> camelCase conversion.
    special_names = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for option, setting in kwargs.items():
        # None or empty values must never override the config.
        if not setting:
            continue
        # The pipetask argument parser turns some values into lists,
        # whereas bps expects a single comma-separated string.
        if isinstance(setting, Iterable) and not isinstance(setting, str):
            setting = ",".join(setting)
        if option in special_names:
            bps_key = special_names[option]
        else:
            bps_key = re.sub(r"_(\S)", lambda found: found.group(1).upper(), option)
        config[f".bps_cmdline.{bps_key}"] = setting

    # The environment variable BPS_WMS_SERVICE_CLASS is only honored when
    # the WMS service class was given neither on the command line nor
    # explicitly in the config file (i.e. the config still holds the package
    # default).  If the variable is unset, the package default stays.
    env_wms_class = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if env_wms_class is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        package_defaults = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == package_defaults["wmsServiceClass"]:
            config["wmsServiceClass"] = env_wms_class

    # Values defined by bps itself.
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    # Validate the settings every submission needs.
    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")
    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")
    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")
    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # When requested, let the WMS plugin verify early that it has what it
    # will need for prepare() or submit().
    if not kwargs.get("runWmsSubmissionChecks", False):
        _LOG.debug("Skipping submission checks.")
    else:
        found, wms_class_name = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Importing the class doubles as a check that it is importable.
        service_class = doImport(wms_class_name)
        service = service_class(config)
        try:
            service.run_submission_checks()
        except NotImplementedError:
            # Plugins implement the checks only if they need extra ones.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class_name)

    # Create the submit directory which will hold all outputs.  It must not
    # exist yet.
    submit_dir = Path(config["submitPath"])
    try:
        submit_dir.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        reason = "Directory already exists" if exc.errno == errno.EEXIST else exc.strerror
        raise type(exc)(f"cannot create submit directory '{submit_dir}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_dir)
    print(f"Submit dir: {submit_dir}")

    # Keep both the original and the expanded configuration.
    shutil.copy2(config_file, submit_dir)
    proc_name = config["uniqProcName"]
    with open(f"{submit_dir}/{proc_name}_config.yaml", "w") as fh:
        config.dump(fh)

    # Record the runtime environment and the software versions in use.
    _dump_env_info(f"{submit_dir}/{proc_name}.env.info.yaml")
    _dump_pkg_info(f"{submit_dir}/{proc_name}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Initialize the submission and obtain the quantum graph.

    The quantum graph is either read from a file or created from a pipeline
    definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    # Options shared by both timed sections of this driver.
    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }
    peak_msg = "Peak memory usage for bps process %s (main), %s (largest child process)"

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_opts):
        config = _init_submission_driver(config_file, **kwargs)
        out_dir = config[".bps_defined.submitPath"]
    # Only gather the memory figures if they are actually going to be logged.
    if _LOG.isEnabledFor(logging.INFO):
        peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info(peak_msg, *peaks)

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_opts):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=out_dir)
    if _LOG.isEnabledFor(logging.INFO):
        peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info(peak_msg, *peaks)

    # Record where the acquire stage put its products.
    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Acquire the quantum graph and group its quanta into clusters.

    Depending on the ``saveClusteredQgraph`` and ``saveDot`` settings, the
    clustered graph is also written to the submit directory.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    timer = time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    )
    with timer:
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    # Only gather the memory figures if they are actually going to be logged.
    if _LOG.isEnabledFor(logging.INFO):
        peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # Optional on-disk copies of the clustered graph; both are off by default.
    out_dir = config[".bps_defined.submitPath"]
    _, want_pickle = config.search("saveClusteredQgraph", opt={"default": False})
    if want_pickle:
        clustered_qgraph.save(os.path.join(out_dir, "bps_clustered_qgraph.pickle"))
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        clustered_qgraph.draw(os.path.join(out_dir, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Cluster the quantum graph and transform it into a generic workflow.

    Depending on the ``saveGenericWorkflow`` and ``saveDot`` settings, the
    generic workflow is also written to the submit directory.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the cluster stage.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Generic workflow produced by the transform stage.

    Notes
    -----
    NOTE(review): the documented type of ``generic_workflow`` was carried
    over from the previous docstring and looks copied from the prepare
    stage; confirm against the return type of ``transform``.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    out_dir = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    timer = time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    )
    with timer:
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, out_dir)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    # Only gather the memory figures if they are actually going to be logged.
    if _LOG.isEnabledFor(logging.INFO):
        peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # Optional on-disk copies of the generic workflow; both are off by default.
    _, want_pickle = config.search("saveGenericWorkflow", opt={"default": False})
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_generic_workflow.pickle")
        with open(pickle_name, "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_generic_workflow.dot")
        with open(dot_name, "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Create a WMS-specific implementation of the generic workflow.

    Unless the caller says otherwise, the WMS submission checks are enabled
    for the underlying initialization.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the transform stage.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Turn the checks on by default, but respect an explicit caller choice.
    if "runWmsSubmissionChecks" not in kwargs:
        kwargs["runWmsSubmissionChecks"] = True
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    out_dir = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    timer = time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    )
    with timer:
        wms_workflow = prepare(generic_workflow_config, generic_workflow, out_dir)
    # Only gather the memory figures if they are actually going to be logged.
    if _LOG.isEnabledFor(logging.INFO):
        peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    # The generic workflow configuration also serves as the WMS workflow
    # configuration.
    return generic_workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Prepare a workflow and submit it for execution.

    The id and the name of the submitted run are printed at the end.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Additional values forwarded to the prepare stage.
    """
    # Turn the checks on by default, but respect an explicit caller choice.
    if "runWmsSubmissionChecks" not in kwargs:
        kwargs["runWmsSubmissionChecks"] = True

    _LOG.info("DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
              "not accurately reflect actual memory usage by the bps process.")

    # Options shared by the outer (whole process) and inner (submit) timers.
    timer_opts = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **timer_opts):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timer_opts):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
    # Only gather the memory figures if they are actually going to be logged.
    if _LOG.isEnabledFor(logging.INFO):
        peaks = tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage())
        _LOG.info("Peak memory usage for bps process %s (main), %s (largest child process)", *peaks)

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow and print the outcome.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.  If `None`, the value of the environment variable
        BPS_WMS_SERVICE_CLASS is used, falling back to the package default.
    run_id : `str`
        Id or path of the workflow that needs to be restarted.
    """
    if wms_service is None:
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)

    new_run_id, run_name, message = restart(wms_service, run_id)

    # No new run id means the restart did not succeed.
    if new_run_id is None:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")
        return

    # Environment and package information is only dumped when run_id is an
    # existing path.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.  If `None`, the value of the environment variable
        BPS_WMS_SERVICE_CLASS is used, falling back to the package default.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days; passed on unchanged to the report function.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.  If `None`,
        the value of the environment variable BPS_WMS_SERVICE_CLASS is used,
        falling back to the package default.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        package_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", package_default)
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)