Coverage for python/lsst/ctrl/execute/allocator.py : 68%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python
3#
4# LSST Data Management System
5# Copyright 2008-2016 LSST Corporation.
6#
7# This product includes software developed by the
8# LSST Project (http://www.lsst.org/).
9#
10# This program is free software: you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the LSST License Statement and
21# the GNU General Public License along with this program. If not,
22# see <http://www.lsstcorp.org/LegalNotices/>.
23#
25import os
26import sys
27import pwd
28from datetime import datetime
29from string import Template
30from lsst.ctrl.execute import envString
31from lsst.ctrl.execute.allocationConfig import AllocationConfig
32from lsst.ctrl.execute.condorInfoConfig import CondorInfoConfig
33from lsst.ctrl.execute.templateWriter import TemplateWriter
34from lsst.ctrl.execute.seqFile import SeqFile
class Allocator:
    """Consolidate allocation pex_config information with override
    information (obtained from the command line) and produce a PBS file
    using these values.

    Parameters
    ----------
    platform : `str`
        the name of the platform to execute on
    opts : `Config`
        Config object containing options. This class reads the attributes
        ``nodeCount``, ``cpus``, ``maximumWallClock``, ``queue``, ``email``,
        ``glideinShutdown``, ``nodeSet``, ``outputLog``, ``errorLog``,
        ``verbose`` and ``dynamic`` from it.
    configuration : `object`
        Object whose ``platform.localScratch`` and ``platform.scheduler``
        attributes are read by `load`.
    condorInfoFileName : `str`
        Name of the file containing Config information

    Raises
    ------
    RuntimeError
        Raised if no user name or user home could be determined for
        ``platform``.
    """

    def __init__(self, platform, opts, configuration, condorInfoFileName):
        self.opts = opts
        self.defaults = {}
        self.configuration = configuration

        fileName = envString.resolve(condorInfoFileName)
        condorInfoConfig = CondorInfoConfig()
        condorInfoConfig.load(fileName)

        self.platform = platform

        # Look up the user's name and home directory in the
        # $HOME/.lsst/condor-info.py file
        # If the platform is lsst, and the user_name or user_home
        # is not in there, then default to user running this
        # command and the value of $HOME, respectively.
        user_name = None
        user_home = None
        for name in condorInfoConfig.platform:
            if name == self.platform:
                user_name = condorInfoConfig.platform[name].user.name
                user_home = condorInfoConfig.platform[name].user.home

        if self.platform == "lsst":
            if user_name is None:
                user_name = pwd.getpwuid(os.geteuid()).pw_name
            if user_home is None:
                user_home = os.getenv('HOME')

        if user_name is None:
            raise RuntimeError("error: %s does not specify user name for platform == %s" %
                               (condorInfoFileName, self.platform))
        if user_home is None:
            raise RuntimeError("error: %s does not specify user home for platform == %s" %
                               (condorInfoFileName, self.platform))

        self.defaults["USER_NAME"] = user_name
        self.defaults["USER_HOME"] = user_home

        # Values taken from the command line. An entry whose value is None
        # means "not given on the command line"; createFile() and
        # getParameter() both fall back to self.defaults in that case.
        self.commandLineDefaults = {}

        self.commandLineDefaults["NODE_COUNT"] = self.opts.nodeCount
        self.commandLineDefaults["CPUS"] = self.opts.cpus
        self.commandLineDefaults["WALL_CLOCK"] = self.opts.maximumWallClock

        self.commandLineDefaults["QUEUE"] = self.opts.queue
        if self.opts.email == "no":
            # NOTE(review): "#" presumably comments out the e-mail directive
            # in the submit template -- confirm against the templates.
            self.commandLineDefaults["EMAIL_NOTIFICATION"] = "#"

        self.load()

    def createNodeSetName(self):
        """Create the next "node_set" name, using the remote user name and
        a stored sequence number.

        Returns
        -------
        nodeSetName : `str`
            the new node_set name
        """
        s = SeqFile("$HOME/.lsst/node-set.seq")
        n = s.nextSeq()
        nodeSetName = "%s_%d" % (self.defaults["USER_NAME"], n)
        return nodeSetName

    def createUniqueIdentifier(self):
        """Create a unique file identifier, based on the user's name
        and the time at which this method is invoked.

        Returns
        -------
        ident : `str`
            the new identifier
        """
        # This naming scheme follows the conventions used for creating
        # RUNID names.  We've found this allows these files to be more
        # easily located and shared with other users when debugging.
        # The tempfile.mkstemp method restricts the file to only the user,
        # and does not guarantee a file name that can easily be identified.
        now = datetime.now()
        username = pwd.getpwuid(os.geteuid()).pw_name
        ident = "%s_%02d_%02d%02d_%02d%02d%02d" % (
            username, now.year, now.month, now.day, now.hour, now.minute, now.second)
        return ident

    def load(self):
        """Load all values from configuration and command line overrides into
        data structures suitable for use by the TemplateWriter object.
        """
        tempLocalScratch = Template(self.configuration.platform.localScratch)
        self.defaults["LOCAL_SCRATCH"] = tempLocalScratch.substitute(USER_NAME=self.defaults["USER_NAME"])
        self.defaults["SCHEDULER"] = self.configuration.platform.scheduler

    def loadAllocationConfig(self, name, suffix):
        """Load all values from allocationConfig and command line overrides
        into data structures suitable for use by the TemplateWriter object.

        Besides filling in ``self.defaults``, this sets
        ``self.uniqueIdentifier``, ``self.configDir``,
        ``self.submitFileName`` and ``self.condorConfigFileName``, and
        creates the ``configs`` directory under LOCAL_SCRATCH if needed.

        Parameters
        ----------
        name : `str`
            Name of the allocation config file; resolved with
            ``envString.resolve`` before it is loaded.
        suffix : `str`
            File extension used for the generated submit file name.

        Returns
        -------
        allocationConfig : `AllocationConfig`
            The loaded allocation configuration.

        Raises
        ------
        RuntimeError
            Raised if the resolved file name does not exist.
        """
        resolvedName = envString.resolve(name)
        allocationConfig = AllocationConfig()
        if not os.path.exists(resolvedName):
            raise RuntimeError("%s was not found." % resolvedName)
        allocationConfig.load(resolvedName)

        self.defaults["QUEUE"] = allocationConfig.platform.queue
        self.defaults["EMAIL_NOTIFICATION"] = allocationConfig.platform.email
        self.defaults["HOST_NAME"] = allocationConfig.platform.loginHostName

        self.defaults["UTILITY_PATH"] = allocationConfig.platform.utilityPath

        if self.opts.glideinShutdown is None:
            self.defaults["GLIDEIN_SHUTDOWN"] = str(allocationConfig.platform.glideinShutdown)
        else:
            self.defaults["GLIDEIN_SHUTDOWN"] = str(self.opts.glideinShutdown)

        if self.opts.nodeSet is None:
            self.defaults["NODE_SET"] = self.createNodeSetName()
        else:
            self.defaults["NODE_SET"] = self.opts.nodeSet

        nodeSetName = self.defaults["NODE_SET"]

        if self.opts.outputLog is not None:
            self.defaults["OUTPUT_LOG"] = self.opts.outputLog
        else:
            self.defaults["OUTPUT_LOG"] = "%s.out" % nodeSetName

        if self.opts.errorLog is not None:
            self.defaults["ERROR_LOG"] = self.opts.errorLog
        else:
            self.defaults["ERROR_LOG"] = "%s.err" % nodeSetName

        # This is the TOTAL number of cores in the job, not just the total
        # of the cores you intend to use.   In other words, the total available
        # on a machine, times the number of machines.
        totalCoresPerNode = allocationConfig.platform.totalCoresPerNode
        self.commandLineDefaults["TOTAL_CORE_COUNT"] = self.opts.nodeCount * totalCoresPerNode

        self.uniqueIdentifier = self.createUniqueIdentifier()

        # write these pbs and config files to {LOCAL_DIR}/configs
        self.configDir = os.path.join(self.defaults["LOCAL_SCRATCH"], "configs")
        # exist_ok avoids the check-then-create race when two allocations
        # start at the same time.
        os.makedirs(self.configDir, exist_ok=True)

        self.submitFileName = os.path.join(self.configDir, "alloc_%s.%s" % (self.uniqueIdentifier, suffix))

        self.condorConfigFileName = os.path.join(self.configDir, "condor_%s.config" % self.uniqueIdentifier)

        self.defaults["GENERATED_CONFIG"] = os.path.basename(self.condorConfigFileName)
        self.defaults["CONFIGURATION_ID"] = self.uniqueIdentifier
        return allocationConfig

    def createSubmitFile(self, inputFile):
        """Create a PBS file using the file "inputFile" as a Template

        Parameters
        ----------
        inputFile : `str`
            Name of the template file.

        Returns
        -------
        outfile : `str`
            The newly created file name
        """
        outfile = self.createFile(inputFile, self.submitFileName)
        if self.opts.verbose:
            print("wrote new PBS file to %s" % outfile)
        return outfile

    def createCondorConfigFile(self, input):
        """Create a Condor config file using the file "input" as a Template

        Parameters
        ----------
        input : `str`
            Name of the template file.

        Returns
        -------
        outfile : `str`
            The newly created file name
        """
        outfile = self.createFile(input, self.condorConfigFileName)
        if self.opts.verbose:
            print("wrote new condor_config file to %s" % outfile)
        return outfile

    def createFile(self, input, output):
        """Create a new file, using "input" as a Template, and write the
        new file to output.

        Parameters
        ----------
        input : `str`
            Name of the template file; resolved with ``envString.resolve``.
        output : `str`
            Name of the file to write.

        Returns
        -------
        outfile : `str`
            The newly created file name
        """
        resolvedInputName = envString.resolve(input)
        if self.opts.verbose:
            print("creating file using %s" % resolvedInputName)
        template = TemplateWriter()
        # Uses the associative arrays of "defaults" and "commandLineDefaults"
        # to write out the new file from the template.
        # The commandLineDefaults override values in "defaults"; overrides
        # that are None were not given and are skipped.
        substitutes = self.defaults.copy()
        for key, val in self.commandLineDefaults.items():
            if val is not None:
                substitutes[key] = val
        template.rewrite(resolvedInputName, output, substitutes)
        return output

    def isVerbose(self):
        """Status of the verbose flag

        Returns
        -------
        verbose : `bool`
            the value of ``opts.verbose``
        """
        return self.opts.verbose

    def getUserName(self):
        """Accessor for USER_NAME

        Returns
        -------
        value
            the value of USER_NAME
        """
        return self.getParameter("USER_NAME")

    def getUserHome(self):
        """Accessor for USER_HOME

        Returns
        -------
        value
            the value of USER_HOME
        """
        return self.getParameter("USER_HOME")

    def getHostName(self):
        """Accessor for HOST_NAME

        Returns
        -------
        value
            the value of HOST_NAME
        """
        return self.getParameter("HOST_NAME")

    def getUtilityPath(self):
        """Accessor for UTILITY_PATH

        Returns
        -------
        value
            the value of UTILITY_PATH
        """
        return self.getParameter("UTILITY_PATH")

    def getScratchDirectory(self):
        """Accessor for SCRATCH_DIR

        Returns
        -------
        value
            the value of SCRATCH_DIR
        """
        # NOTE(review): nothing in this class stores SCRATCH_DIR, so this
        # returns None unless a subclass or caller populates it -- confirm.
        return self.getParameter("SCRATCH_DIR")

    def getLocalScratchDirectory(self):
        """Accessor for LOCAL_SCRATCH

        Returns
        -------
        value
            the value of LOCAL_SCRATCH
        """
        return self.getParameter("LOCAL_SCRATCH")

    def getNodeSetName(self):
        """Accessor for NODE_SET

        Returns
        -------
        value
            the value of NODE_SET
        """
        return self.getParameter("NODE_SET")

    def getNodes(self):
        """Accessor for NODE_COUNT

        Returns
        -------
        value
            the value of NODE_COUNT
        """
        return self.getParameter("NODE_COUNT")

    def getCPUs(self):
        """Accessor for CPUS

        Returns
        -------
        value
            the value of CPUS
        """
        return self.getParameter("CPUS")

    def getWallClock(self):
        """Accessor for WALL_CLOCK

        Returns
        -------
        value
            the value of WALL_CLOCK
        """
        return self.getParameter("WALL_CLOCK")

    def getScheduler(self):
        """Accessor for SCHEDULER

        Returns
        -------
        value
            the value of SCHEDULER
        """
        return self.getParameter("SCHEDULER")

    def getReservation(self):
        """Accessor for RESERVATION

        Returns
        -------
        value
            the value of RESERVATION
        """
        # NOTE(review): nothing in this class stores RESERVATION, so this
        # returns None unless a subclass or caller populates it -- confirm.
        return self.getParameter("RESERVATION")

    def getParameter(self, value):
        """Accessor for generic value

        Parameters
        ----------
        value : `str`
            Name of the parameter to look up.

        Returns
        -------
        result
            None if value is not set.  Otherwise, the command line
            override (if set), or the default Config value
        """
        # An override of None means the option was not given on the command
        # line, so fall through to the defaults (same rule as createFile).
        override = self.commandLineDefaults.get(value)
        if override is not None:
            return override
        return self.defaults.get(value)

    def printNodeSetInfo(self):
        """Print a summary of the requested allocation, followed by the
        node set name.
        """
        nodes = self.getNodes()
        cpus = self.getCPUs()
        wallClock = self.getWallClock()
        nodeString = ""

        # pluralize "node" when more than one is requested
        if int(nodes) > 1:
            nodeString = "s"
        if self.opts.dynamic is None:
            print("%s node%s will be allocated on %s with %s cpus per node and maximum time limit of %s" %
                  (nodes, nodeString, self.platform, cpus, wallClock))
        elif self.opts.dynamic == '__default__':
            print("%s node%s will be allocated on %s using default dynamic slots configuration "
                  "with %s cpus per node and maximum time limit of %s" %
                  (nodes, nodeString, self.platform, cpus, wallClock))
        else:
            print("%s node%s will be allocated on %s using dynamic slot block specified in "
                  "'%s' with %s cpus per node and maximum time limit of %s" %
                  (nodes, nodeString, self.platform, self.opts.dynamic, cpus, wallClock))
        print("Node set name:")
        print(self.getNodeSetName())

    def runCommand(self, cmd, verbose):
        """Run a command in a child process and wait for it to finish.

        Parameters
        ----------
        cmd : `str`
            Command line to run; it is split on whitespace (no shell and no
            quote handling).
        verbose : `bool`
            If false, stdin, stdout and stderr are closed in the child
            before the command is executed.

        Returns
        -------
        exitCode : `int`
            Exit status of the command.  127 if the command could not be
            executed, and ``128 + signal number`` if it was terminated by a
            signal.
        """
        cmd_split = cmd.split()
        pid = os.fork()
        if not pid:
            # Child process.  It must never return into the caller's code:
            # if anything below fails (command not found, empty command),
            # leave immediately with os._exit instead of letting the
            # exception unwind through a copy of the parent program.
            try:
                # Methods of file transfer and login may
                # produce different output, depending on how
                # the "gsi" utilities are used.  The user can
                # either use grid proxies or ssh, and gsiscp/gsissh
                # does the right thing.  Since the output will be
                # different in either case anything potentially parsing this
                # output (like drpRun), would have to go through extra
                # steps to deal with this output, and which ultimately
                # end up not being useful.  So we optionally close the i/o
                # output of the executing command down.
                #
                # stdin/stdout/stderr is treated specially
                # by python, so we have to close down
                # both the python objects and the
                # underlying c implementations
                if not verbose:
                    # close python i/o
                    sys.stdin.close()
                    sys.stdout.close()
                    sys.stderr.close()
                    # close C's i/o
                    os.close(0)
                    os.close(1)
                    os.close(2)
                os.execvp(cmd_split[0], cmd_split)
            except Exception as exc:
                if verbose:
                    sys.stderr.write("unable to execute '%s': %s\n" % (cmd, exc))
                    sys.stderr.flush()
            finally:
                # Only reached when execvp did not replace the process.
                os._exit(127)
        # Wait for the child we forked, not for whichever child exits first.
        _, status = os.waitpid(pid, 0)
        if os.WIFSIGNALED(status):
            # A signal death carries no exit status; report it as a failure
            # rather than as exit code 0.
            return 128 + os.WTERMSIG(status)
        return os.WEXITSTATUS(status)