Coverage for python/lsst/ctrl/bps/drivers.py: 12%
177 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-16 02:18 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-16 02:18 -0700
1# This file is part of ctrl_bps.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Driver functions for each subcommand.

Driver functions ensure that all setup work is done before running
the subcommand method.
"""
# Public API of this module: one driver function per bps subcommand.
__all__ = [
    "acquire_qgraph_driver",
    "cluster_qgraph_driver",
    "transform_driver",
    "prepare_driver",
    "submit_driver",
    "report_driver",
    "restart_driver",
    "cancel_driver",
]
41import errno
42import getpass
43import logging
44import os
45import re
46import shutil
47from collections.abc import Iterable
48from pathlib import Path
50from lsst.pipe.base import Instrument
51from lsst.utils import doImport
52from lsst.utils.timer import time_this
53from lsst.utils.usage import get_peak_mem_usage
55from . import BPS_DEFAULTS, BPS_SEARCH_ORDER, DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT, BpsConfig
56from .bps_utils import _dump_env_info, _dump_pkg_info
57from .cancel import cancel
58from .pre_transform import acquire_quantum_graph, cluster_quanta
59from .prepare import prepare
60from .report import report
61from .restart import restart
62from .submit import submit
63from .transform import transform
# Module-level logger shared by all driver functions.
_LOG = logging.getLogger(__name__)
def _init_submission_driver(config_file, **kwargs):
    """Initialize runtime environment.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides for configuration values.  Empty or
        `None` values are ignored.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Batch Processing Service configuration.

    Raises
    ------
    KeyError
        Raised if a required configuration value is missing
        (``outputRun``, ``submitPath``, or, when submission checks are
        requested, ``wmsServiceClass``), or if the deprecated
        ``outCollection`` key is present.
    OSError
        Raised if the submit directory cannot be created (including when
        it already exists).
    """
    config = BpsConfig(config_file, BPS_SEARCH_ORDER)

    # Override config with command-line values.
    # Handle diffs between pipetask argument names vs bps yaml
    translation = {
        "input": "inCollection",
        "output_run": "outputRun",
        "qgraph": "qgraphFile",
        "pipeline": "pipelineYaml",
        "wms_service": "wmsServiceClass",
    }
    for key, value in kwargs.items():
        # Don't want to override config with None or empty string values.
        if value:
            # pipetask argument parser converts some values to list,
            # but bps will want string.
            if not isinstance(value, str) and isinstance(value, Iterable):
                value = ",".join(value)
            # Keys without an explicit translation are converted from
            # snake_case to camelCase.
            new_key = translation.get(key, re.sub(r"_(\S)", lambda match: match.group(1).upper(), key))
            config[f".bps_cmdline.{new_key}"] = value

    # If the WMS service class was not defined neither at the command line nor
    # explicitly in config file, use the value provided by the environmental
    # variable BPS_WMS_SERVICE_CLASS. If the variable is not set, stick to
    # the package default.
    wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", None)
    if wms_service is not None and "wmsServiceClass" not in config[".bps_cmdline"]:
        default_config = BpsConfig(BPS_DEFAULTS)
        # Only override when the config still holds the packaged default;
        # an explicit setting in the config file wins over the variable.
        if config["wmsServiceClass"] == default_config["wmsServiceClass"]:
            config["wmsServiceClass"] = wms_service

    # Set some initial values
    config[".bps_defined.timestamp"] = Instrument.makeCollectionTimestamp()
    if "operator" not in config:
        config[".bps_defined.operator"] = getpass.getuser()

    if "outCollection" in config:
        raise KeyError("outCollection is deprecated. Replace all outCollection references with outputRun.")

    if "outputRun" not in config:
        raise KeyError("Must specify the output run collection using outputRun")

    if "uniqProcName" not in config:
        config[".bps_defined.uniqProcName"] = config["outputRun"].replace("/", "_")

    if "submitPath" not in config:
        raise KeyError("Must specify the submit-side run directory using submitPath")

    # If requested, run WMS plugin checks early in submission process to
    # ensure WMS has what it will need for prepare() or submit().
    if kwargs.get("runWmsSubmissionChecks", False):
        found, wms_class = config.search("wmsServiceClass")
        if not found:
            raise KeyError("Missing wmsServiceClass in bps config. Aborting.")

        # Check that can import wms service class.
        wms_service_class = doImport(wms_class)
        wms_service = wms_service_class(config)

        try:
            wms_service.run_submission_checks()
        except NotImplementedError:
            # Allow various plugins to implement only when needed to do extra
            # checks.
            _LOG.debug("run_submission_checks is not implemented in %s.", wms_class)
    else:
        _LOG.debug("Skipping submission checks.")

    # Make submit directory to contain all outputs.
    submit_path = Path(config["submitPath"])
    try:
        submit_path.mkdir(parents=True, exist_ok=False)
    except OSError as exc:
        # Re-raise the same exception type with a clearer message;
        # `from None` suppresses the original traceback context.
        if exc.errno == errno.EEXIST:
            reason = "Directory already exists"
        else:
            reason = exc.strerror
        raise type(exc)(f"cannot create submit directory '{submit_path}': {reason}") from None
    config[".bps_defined.submitPath"] = str(submit_path)
    print(f"Submit dir: {submit_path}")

    # save copy of configs (orig and expanded config)
    shutil.copy2(config_file, submit_path)
    with open(f"{submit_path}/{config['uniqProcName']}_config.yaml", "w") as fh:
        config.dump(fh)

    # Dump information about runtime environment and software versions in use.
    _dump_env_info(f"{submit_path}/{config['uniqProcName']}.env.info.yaml")
    _dump_pkg_info(f"{submit_path}/{config['uniqProcName']}.pkg.info.yaml")

    return config
def acquire_qgraph_driver(config_file, **kwargs):
    """Read a quantum graph from a file or create one from pipeline definition.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides forwarded to `_init_submission_driver`.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    qgraph : `lsst.pipe.base.graph.QuantumGraph`
        A graph representing quanta.
    """
    _LOG.info("Initializing execution environment")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Initializing execution environment completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        config = _init_submission_driver(config_file, **kwargs)
        submit_path = config[".bps_defined.submitPath"]
    # Guard avoids the cost of formatting memory values when INFO is off.
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    _LOG.info("Starting acquire stage (generating and/or reading quantum graph)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Acquire stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        qgraph_file, qgraph, execution_butler_dir = acquire_quantum_graph(config, out_prefix=submit_path)
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    # Record where the acquire stage left its outputs for later stages.
    config[".bps_defined.executionButlerDir"] = execution_butler_dir
    config[".bps_defined.runQgraphFile"] = qgraph_file
    return config, qgraph
def cluster_qgraph_driver(config_file, **kwargs):
    """Group quanta into clusters.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides forwarded to `acquire_qgraph_driver`.

    Returns
    -------
    config : `lsst.ctrl.bps.BpsConfig`
        Updated configuration.
    clustered_qgraph : `lsst.ctrl.bps.ClusteredQuantumGraph`
        A graph representing clustered quanta.
    """
    config, qgraph = acquire_qgraph_driver(config_file, **kwargs)

    _LOG.info("Starting cluster stage (grouping quanta into jobs)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Cluster stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        clustered_qgraph = cluster_quanta(config, qgraph, config["uniqProcName"])
    # Guard avoids the cost of formatting memory values when INFO is off.
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    _LOG.info("ClusteredQuantumGraph contains %d cluster(s)", len(clustered_qgraph))

    # Optionally persist the clustered graph and/or its DOT rendering for
    # debugging; both default to off.
    submit_path = config[".bps_defined.submitPath"]
    _, save_clustered_qgraph = config.search("saveClusteredQgraph", opt={"default": False})
    if save_clustered_qgraph:
        clustered_qgraph.save(os.path.join(submit_path, "bps_clustered_qgraph.pickle"))
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        clustered_qgraph.draw(os.path.join(submit_path, "bps_clustered_qgraph.dot"))
    return config, clustered_qgraph
def transform_driver(config_file, **kwargs):
    """Create a workflow for a specific workflow management system.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides forwarded to `cluster_qgraph_driver`.

    Returns
    -------
    generic_workflow_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    generic_workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    config, clustered_qgraph = cluster_qgraph_driver(config_file, **kwargs)
    submit_path = config[".bps_defined.submitPath"]

    _LOG.info("Starting transform stage (creating generic workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Transform stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        generic_workflow, generic_workflow_config = transform(config, clustered_qgraph, submit_path)
        _LOG.info("Generic workflow name '%s'", generic_workflow.name)
    # Guard avoids the cost of formatting memory values when INFO is off.
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )
    num_jobs = sum(generic_workflow.job_counts.values())
    _LOG.info("GenericWorkflow contains %d job(s) (including final)", num_jobs)

    # Optionally persist the generic workflow and/or its DOT rendering for
    # debugging; both default to off.
    _, save_workflow = config.search("saveGenericWorkflow", opt={"default": False})
    if save_workflow:
        with open(os.path.join(submit_path, "bps_generic_workflow.pickle"), "wb") as outfh:
            generic_workflow.save(outfh, "pickle")
    _, save_dot = config.search("saveDot", opt={"default": False})
    if save_dot:
        with open(os.path.join(submit_path, "bps_generic_workflow.dot"), "w") as outfh:
            generic_workflow.draw(outfh, "dot")
    return generic_workflow_config, generic_workflow
def prepare_driver(config_file, **kwargs):
    """Create a representation of the generic workflow.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides forwarded to `transform_driver`;
        ``runWmsSubmissionChecks`` defaults to `True` here.

    Returns
    -------
    wms_config : `lsst.ctrl.bps.BpsConfig`
        Configuration to use when creating the workflow.
    workflow : `lsst.ctrl.bps.BaseWmsWorkflow`
        Representation of the abstract/scientific workflow specific to a given
        workflow management system.
    """
    # Run the early WMS plugin checks unless the caller opted out.
    kwargs.setdefault("runWmsSubmissionChecks", True)
    generic_workflow_config, generic_workflow = transform_driver(config_file, **kwargs)
    submit_path = generic_workflow_config[".bps_defined.submitPath"]

    _LOG.info("Starting prepare stage (creating specific implementation of workflow)")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Prepare stage completed",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow = prepare(generic_workflow_config, generic_workflow, submit_path)
    # Guard avoids the cost of formatting memory values when INFO is off.
    if _LOG.isEnabledFor(logging.INFO):
        _LOG.info(
            "Peak memory usage for bps process %s (main), %s (largest child process)",
            *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
        )

    wms_workflow_config = generic_workflow_config
    return wms_workflow_config, wms_workflow
def submit_driver(config_file, **kwargs):
    """Submit workflow for execution.

    Parameters
    ----------
    config_file : `str`
        Name of the configuration file.
    **kwargs
        Command-line overrides forwarded to `prepare_driver`;
        ``runWmsSubmissionChecks`` defaults to `True` here.
    """
    # Run the early WMS plugin checks unless the caller opted out.
    kwargs.setdefault("runWmsSubmissionChecks", True)

    _LOG.info(
        "DISCLAIMER: All values regarding memory consumption reported below are approximate and may "
        "not accurately reflect actual memory usage by the bps process."
    )

    _LOG.info("Starting submission process")
    with time_this(
        log=_LOG,
        level=logging.INFO,
        prefix=None,
        msg="Completed entire submission process",
        mem_usage=True,
        mem_unit=DEFAULT_MEM_UNIT,
        mem_fmt=DEFAULT_MEM_FMT,
    ):
        wms_workflow_config, wms_workflow = prepare_driver(config_file, **kwargs)

        _LOG.info("Starting submit stage")
        with time_this(
            log=_LOG,
            level=logging.INFO,
            prefix=None,
            msg="Completed submit stage",
            mem_usage=True,
            mem_unit=DEFAULT_MEM_UNIT,
            mem_fmt=DEFAULT_MEM_FMT,
        ):
            submit(wms_workflow_config, wms_workflow)
            _LOG.info("Run '%s' submitted for execution with id '%s'", wms_workflow.name, wms_workflow.run_id)
        # Guard avoids the cost of formatting memory values when INFO is off.
        if _LOG.isEnabledFor(logging.INFO):
            _LOG.info(
                "Peak memory usage for bps process %s (main), %s (largest child process)",
                *tuple(f"{val.to(DEFAULT_MEM_UNIT):{DEFAULT_MEM_FMT}}" for val in get_peak_mem_usage()),
            )

    print(f"Run Id: {wms_workflow.run_id}")
    print(f"Run Name: {wms_workflow.name}")
def restart_driver(wms_service, run_id):
    """Restart a failed workflow.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        Id or path of workflow that need to be restarted.
    """
    # No service class given explicitly: prefer the environment variable,
    # falling back to the packaged default.
    if wms_service is None:
        packaged_default = BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        wms_service = os.environ.get("BPS_WMS_SERVICE_CLASS", packaged_default)

    new_run_id, run_name, message = restart(wms_service, run_id)
    if new_run_id is None:
        # Restart did not happen; report why (if the WMS told us).
        print(f"Restart failed: {message}" if message else "Restart failed: Unknown error")
        return

    # Refresh the runtime/software info files when run_id is a path to the
    # original submit directory.
    if Path(run_id).exists():
        _dump_env_info(f"{run_id}/{run_name}.env.info.yaml")
        _dump_pkg_info(f"{run_id}/{run_name}.pkg.info.yaml")
    print(f"Run Id: {new_run_id}")
    print(f"Run Name: {run_name}")
def report_driver(wms_service, run_id, user, hist_days, pass_thru, is_global=False):
    """Print out summary of jobs submitted for execution.

    Parameters
    ----------
    wms_service : `str`
        Name of the class.
    run_id : `str`
        A run id the report will be restricted to.
    user : `str`
        A user name the report will be restricted to.
    hist_days : `int`
        Number of days.
    pass_thru : `str`
        A string to pass directly to the WMS service class.
    is_global : `bool`, optional
        If set, all available job queues will be queried for job information.
        Defaults to False which means that only a local job queue will be
        queried for information.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    # No service class given explicitly: prefer the environment variable,
    # falling back to the packaged default.
    if wms_service is None:
        wms_service = os.environ.get(
            "BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        )
    report(wms_service, run_id, user, hist_days, pass_thru, is_global=is_global)
def cancel_driver(wms_service, run_id, user, require_bps, pass_thru, is_global=False):
    """Cancel submitted workflows.

    Parameters
    ----------
    wms_service : `str`
        Name of the Workload Management System service class.
    run_id : `str`
        ID or path of job that should be canceled.
    user : `str`
        User whose submitted jobs should be canceled.
    require_bps : `bool`
        Whether to require given run_id/user to be a bps submitted job.
    pass_thru : `str`
        Information to pass through to WMS.
    is_global : `bool`, optional
        If set, all available job queues will be checked for jobs to cancel.
        Defaults to False which means that only a local job queue will be
        checked.

        Only applicable in the context of a WMS using distributed job queues
        (e.g., HTCondor).
    """
    # No service class given explicitly: prefer the environment variable,
    # falling back to the packaged default.
    if wms_service is None:
        wms_service = os.environ.get(
            "BPS_WMS_SERVICE_CLASS", BpsConfig(BPS_DEFAULTS)["wmsServiceClass"]
        )
    cancel(wms_service, run_id, user, require_bps, pass_thru, is_global=is_global)