Coverage for python/lsst/ctrl/bps/drivers.py: 16%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Driver functions for each subcommand.
24Driver functions ensure that all setup work is done before running
25the subcommand method.
26"""
# Public API of this module: one driver function per bps subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
]
41import errno
42import getpass
43import logging
44import os
45import re
46import shutil
47from collections.abc import Iterable
48from pathlib import Path
50from lsst.obs.base import Instrument
51from lsst.utils import doImport
52from lsst.utils.timer import time_this
54from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, BpsConfig
55from .pre_transform import acquire_quantum_graph, cluster_quanta
56from .transform import transform
57from .prepare import prepare
58from .submit import submit
59from .cancel import cancel
60from .report import report
61from .restart import restart
62from .bps_utils import _dump_env_info, _dump_pkg_info
# Module-level logger shared by all driver functions below.
_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Build the submission configuration and create the submit directory.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values. Every truthy value is stored under the
        ``.bps_cmdline`` section of the configuration; falsy values
        (`None`, empty strings, ...) are ignored. The value of
        ``runWmsSubmissionChecks`` additionally decides whether the WMS
        plugin checks are run.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if ``outCollection`` is present, if ``outputRun`` or
        ``submitPath`` is missing, or if the submission checks were
        requested but ``wmsServiceClass`` cannot be found.
    OSError
        Raised if the submit directory cannot be created, including the
        case when it already exists.
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Command-line option names that do not map onto the bps yaml key by a
    # plain snake_case -> camelCase conversion.
    cmdline_to_config = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for option, setting in kwargs.items():
        # None and empty values must not override what is in the config.
        if not setting:
            continue

        # The pipetask argument parser turns some values into lists while
        # bps expects a single comma-separated string.
        if not isinstance(setting, str) and isinstance(setting, Iterable):
            setting = ",".join(setting)

        if option in cmdline_to_config:
            config_key = cmdline_to_config[option]
        else:
            config_key = re.sub(r"_(\S)", lambda match: match.group(1).upper(), option)
        config[f".bps_cmdline.{config_key}"] = setting

    # The environment variable BPS_WMS_SERVICE_CLASS is used only when the
    # WMS service class was given neither on the command line nor explicitly
    # in the config file (i.e. the config still holds the package default).
    env_wms_class = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if env_wms_class is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        package_defaults = BpsConfig(BPS_DEFAULTS)
        if config["wmsServiceClass"] == package_defaults["wmsServiceClass"]:
            config["wmsServiceClass"] = env_wms_class

    # Values defined by bps itself.
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    # Validate the settings every submission depends on.
    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # When requested, run the WMS plugin checks early so problems show up
    # before prepare() or submit() need the WMS.
    if not kwargs.get("runWmsSubmissionChecks", False):
        _LOG.debug("Skipping submission checks.")
    else:
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Importing the class also verifies that it can be imported at all.
        plugin_class = doImport(wms_class)
        plugin = plugin_class(config)

        try:
            plugin.run_submission_checks()
        except NotImplementedError:
            # Plugins implement the checks only when they need extra ones.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)

    # Create the submit directory that will contain all outputs; an
    # already existing directory is treated as an error.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        reason = "Directory already exists" if exc.errno == errno.EEXIST else exc.strerror
        # Re-raise the same exception type with a friendlier message and
        # without the original exception context.
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # Keep both the original config file and the expanded config.
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as expanded_fh:
        config.dump(expanded_fh)

    # Record the runtime environment and the software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Initialize the submission and obtain the quantum graph.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values forwarded unchanged to the submission
        initialization.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration; ``executionButlerDir`` and ``runQgraphFile``
        are recorded in its ``.bps_defined`` section.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    config = _init_submission_driver(config_file, **kwargs)
    out_prefix = config[".bps_defined.submitPath"]

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Acquire stage completed"):
        acquired = acquire_quantum_graph(config, out_prefix=out_prefix)
    qgraph_file, qgraph, execution_butler_dir = acquired

    # Remember where the graph and the execution butler live for later stages.
    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Run the acquire stage and group the resulting quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values forwarded unchanged to the acquire stage.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Cluster stage completed"):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])

    out_dir = config[".bps_defined.submitPath"]

    # Optional debugging artifacts; both are off unless enabled in the config.
    _, want_pickle = config.search("saveClusteredQgraph", opt={"default": False})
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_clustered_qgraph.pickle")
        clustered_qgraph.save(pickle_name)

    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_clustered_qgraph.dot")
        clustered_qgraph.draw(dot_name)

    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Run the cluster stage and turn its result into a generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values forwarded unchanged to the cluster stage.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow
        The generic workflow produced by the transform stage from the
        clustered quantum graph.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    out_dir = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Transform stage completed"):
        transformed = transform(config, clustered_qgraph, out_dir)
        generic_workflow, generic_workflow_config = transformed
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)

    # Optional debugging artifacts; both are off unless enabled in the config.
    _, want_pickle = config.search("saveGenericWorkflow", opt={"default": False})
    if want_pickle:
        pickle_name = os.path.join(out_dir, "bps_generic_workflow.pickle")
        with open(pickle_name, "wb") as pickle_fh:
            generic_workflow.save(pickle_fh, "pickle")

    _, want_dot = config.search("saveDot", opt={"default": False})
    if want_dot:
        dot_name = os.path.join(out_dir, "bps_generic_workflow.dot")
        with open(dot_name, "w") as dot_fh:
            generic_workflow.draw(dot_fh, "dot")

    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Run the transform stage and create the WMS-specific workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values forwarded to the transform stage.
        ``runWmsSubmissionChecks`` is set to `True` unless the caller
        provided a value for it.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow. It is the same
        configuration object the transform stage returned.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # WMS checks are on by default for this stage; an explicit value wins.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    out_dir = workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Prepare stage completed"):
        wms_workflow = prepare(workflow_config, generic_workflow, out_dir)

    return workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Run every submission stage and submit the workflow for execution.

    The run id and the run name of the submitted workflow are printed to
    standard output.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line values forwarded to the prepare stage.
        ``runWmsSubmissionChecks`` is set to `True` unless the caller
        provided a value for it.
    """
    # WMS checks are on by default for this stage; an explicit value wins.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info("Starting submission process")
    # The outer timer covers all stages, the inner one only the submit stage.
    with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed entire submission process"):
        workflow_config, workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(log=_LOG, level=logging.INFO, prefix=None, msg="Completed submit stage"):
            submit(workflow_config, workflow)
        _LOG.info("Run '%s' submitted for execution with id '%s'", workflow.name, workflow.run_id)

    print(f"Run Id: {workflow.run_id}")
    print(f"Run Name: {workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    On success the new run id and the run name are printed to standard
    output; on failure the reason reported by the restart is printed
    instead.

    Parameters
    ----------
    wms_service : `str` or `None`
        Name of the WMS service class. If `None`, the value of the
        environment variable ``BPS_WMS_SERVICE_CLASS`` is used and, when
        the variable is not set, ``wmsServiceClass`` from the package
        defaults.
    run_id : `str`
        Id or path of the workflow that needs to be restarted.
    """
    if wms_service is None:
        default_config = BpsConfig(BPS_DEFAULTS)
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", default_config["wmsServiceClass"])

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        if message:
            print(f"Restart failed: {message}")
        else:
            print("Restart failed: Unknown error")
        return

    # The info files are written *inside* ``run_id``, so it has to be a
    # directory. A plain id, or a path to something that is not a directory,
    # must not make an otherwise successful restart fail before the new
    # run id is reported.
    if Path(run_id).is_dir():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Report on jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str` or `None`
        Name of the WMS service class. If `None`, the value of the
        environment variable ``BPS_WMS_SERVICE_CLASS`` is used and, when
        the variable is not set, ``wmsServiceClass`` from the package
        defaults.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days; passed through unchanged to the report
        (presumably how far back in history to look -- confirm against
        the report implementation).
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Nothing given explicitly: the environment variable wins over the
        # package default.
        package_defaults = BpsConfig(BPS_DEFAULTS)
        fallback_class = package_defaults["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback_class)

    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str` or `None`
        Name of the Workload Management System service class. If `None`,
        the value of the environment variable ``BPS_WMS_SERVICE_CLASS`` is
        used and, when the variable is not set, ``wmsServiceClass`` from
        the package defaults.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    if wms_service is None:
        # Nothing given explicitly: the environment variable wins over the
        # package default.
        package_defaults = BpsConfig(BPS_DEFAULTS)
        fallback_class = package_defaults["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", fallback_class)

    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)