Coverage for python/lsst/ctrl/bps/drivers.py: 13%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Driver functions for each subcommand.
Driver functions ensure that all setup work is done before running
the subcommand method.
26"""
# Public API of this module: one driver function per bps subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
]
41import errno
42import getpass
43import logging
44import os
45import re
46import shutil
47from collections.abc import Iterable
48from pathlib import Path
50from lsst.obs.base import Instrument
51from lsst.utils import doImport
52from lsst.utils.timer import time_this
53from lsst.utils.usage import get_peak_mem_usage
55from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
56from .bps_utils import _dump_env_info, _dump_pkg_info
57from .cancel import cancel
58from .pre_transform import acquire_quantum_graph, cluster_quanta
59from .prepare import prepare
60from .report import report
61from .restart import restart
62from .submit import submit
63from .transform import transform
65_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values overriding the configuration as well as driver
        options (e.g., ``runWmsSubmissionChecks``).

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required configuration entry is missing, a deprecated
        entry is present, or the WMS service class cannot be determined when
        submission checks are requested.
    OSError
        Raised if the submit directory cannot be created.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    _override_config_from_cmdline(config, kwargs)
    _apply_wms_service_env_default(config)
    _set_initial_values(config)

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        _run_wms_submission_checks(config)
    else:
        _LOG.debug("Skipping submission checks.")

    submit_path = _make_submit_dir(config)
    _save_submit_info(config, config_file, submit_path)

    return config


def _override_config_from_cmdline(config, overrides):
    """Override config values with non-empty command-line values.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration updated in place (values stored under ``.bps_cmdline``).
    overrides : `dict`
        Command-line key/value pairs.
    """
    # Handle diffs between pipetask argument names vs bps yaml.
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in overrides.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Fall back to snake_case -> camelCase conversion for keys not in
            # the translation table.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value


def _apply_wms_service_env_default(config):
    """Apply the BPS_WMS_SERVICE_CLASS fallback for the WMS service class.

    If the WMS service class was not defined neither at the command line nor
    explicitly in config file, use the value provided by the environmental
    variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    the package default.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration updated in place.
    """
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # Only apply the env var when the config still carries the package
        # default, i.e., the config file did not set it explicitly.
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service


def _set_initial_values(config):
    """Set some initial values and validate required config entries.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration updated in place (values stored under ``.bps_defined``).

    Raises
    ------
    KeyError
        Raised if a required entry is missing or a deprecated entry is used.
    """
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")


def _run_wms_submission_checks(config):
    """Run WMS plugin checks early in the submission process.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration providing the WMS service class.

    Raises
    ------
    KeyError
        Raised if wmsServiceClass is missing from the config.
    """
    found, wms_class = config.search("wmsServiceClass")
    if not found:
        raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

    # Check that can import wms service class.
    wms_service_class = doImport(wms_class)
    wms_service = wms_service_class(config)

    try:
        wms_service.run_submission_checks()
    except NotImplementedError:
        # Allow various plugins to implement only when needed to do extra
        # checks.
        _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)


def _make_submit_dir(config):
    """Make submit directory to contain all outputs.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Configuration providing ``submitPath``; updated in place with the
        resolved directory under ``.bps_defined.submitPath``.

    Returns
    -------
    submit_path : `pathlib.Path`
        Newly created submit directory.

    Raises
    ------
    OSError
        Raised (same subtype as the underlying failure) if the directory
        already exists or cannot be created.
    """
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        # Re-raise with a friendlier message; suppress the original traceback.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")
    return submit_path


def _save_submit_info(config, config_file, submit_path):
    """Save configuration and runtime information in the submit directory.

    Parameters
    ----------
    config : `lsst.ctrl.bps.BpsConfig`
        Expanded configuration to dump.
    config_file : `str`
        Name of the original configuration file to copy.
    submit_path : `pathlib.Path`
        Submit directory receiving the files.
    """
    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")
def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline
    definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides passed to submission initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """

    def _log_peak_memory():
        # Formatting memory values is not free; skip when INFO is disabled.
        if _LOG.isEnabledFor(logging.INFO):
            usage = [f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()]
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *usage,
            )

    # Shared keyword arguments for the stage timers.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Initializing execution environment")
    with time_this(msg="Initializing execution environment completed", **timer_args):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    _log_peak_memory()

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(msg="Acquire stage completed", **timer_args):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    _log_peak_memory()

    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Passed through to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    if _LOG.isEnabledFor(logging.INFO):
        usage = [f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()]
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *usage,
        )

    submit_path = config[".bps_defined.submitPath"]
    # Optionally persist the clustered graph and/or its DOT rendering.
    if config.search("saveClusteredQgraph", opt={"default": False})[1]:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    if config.search("saveDot", opt={"default": False})[1]:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Passed through to the earlier pipeline stages.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    if _LOG.isEnabledFor(logging.INFO):
        usage = [f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()]
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *usage,
        )

    # Optionally persist the generic workflow (pickle) and/or its DOT
    # rendering for debugging.
    if config.search("saveGenericWorkflow", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    if config.search("saveDot", opt={"default": False})[1]:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Passed through to the earlier pipeline stages; WMS submission checks
        are enabled unless the caller says otherwise.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Run the WMS plugin's early checks by default for prepare/submit.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        usage = [f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()]
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *usage,
        )

    # The WMS-specific workflow reuses the generic workflow's configuration.
    return generic_workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Passed through to the prepare stage.
    """
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    # Shared keyword arguments for the outer and inner timers.
    timer_args = {
        "log": _LOG,
        "level": logging.INFO,
        "prefix": None,
        "mem_usage": True,
        "mem_unit": DEFAULT_MEM_UNIT,
        "mem_fmt": DEFAULT_MEM_FMT,
    }

    _LOG.info("Starting submission process")
    with time_this(msg="Completed entire submission process", **timer_args):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(msg="Completed submit stage", **timer_args):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info(
                "Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id
            )
        if _LOG.isEnabledFor(logging.INFO):
            usage = [f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()]
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *usage,
            )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    if wms_service is None:
        # Environment variable wins over the packaged default.
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")
        return

    # Refresh runtime/package info dumps when run_id is a submit directory.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days; forwarded unchanged to the report subcommand.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Environment variable wins over the packaged default.
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Environment variable wins over the packaged default.
        fallback = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback)
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)