Coverage for python/lsst/ctrl/execute/allocator.py: 68%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

175 statements  

1#!/usr/bin/env python 

2 

3# 

4# LSST Data Management System 

5# Copyright 2008-2016 LSST Corporation. 

6# 

7# This product includes software developed by the 

8# LSST Project (http://www.lsst.org/). 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the LSST License Statement and 

21# the GNU General Public License along with this program. If not, 

22# see <http://www.lsstcorp.org/LegalNotices/>. 

23# 

24 

25import os 

26import sys 

27import pwd 

28from datetime import datetime 

29from string import Template 

30from lsst.ctrl.execute import envString 

31from lsst.ctrl.execute.allocationConfig import AllocationConfig 

32from lsst.ctrl.execute.condorInfoConfig import CondorInfoConfig 

33from lsst.ctrl.execute.templateWriter import TemplateWriter 

34from lsst.ctrl.execute.seqFile import SeqFile 

35 

36 

class Allocator:
    """Consolidate allocation pex_config information with override
    information (obtained from the command line) and produce a PBS file
    using these values.

    Parameters
    ----------
    platform : `str`
        the name of the platform to execute on
    opts : `Config`
        Config object containing options
    configuration : `object`
        Object whose ``platform.localScratch`` and ``platform.scheduler``
        attributes are read by `load`.
    condorInfoFileName : `str`
        Name of the file containing Config information

    Raises
    ------
    RuntimeError
        Raised if no user name or user home can be determined for
        ``platform``.
    """

    def __init__(self, platform, opts, configuration, condorInfoFileName):
        self.opts = opts
        self.defaults = {}
        self.configuration = configuration

        fileName = envString.resolve(condorInfoFileName)
        condorInfoConfig = CondorInfoConfig()
        condorInfoConfig.load(fileName)

        self.platform = platform

        # Look up the user's name and home directory in the
        # $HOME/.lsst/condor-info.py file.
        # If the platform is lsst, and the user_name or user_home
        # is not in there, then default to the user running this
        # command and the value of $HOME, respectively.
        user_name = None
        user_home = None
        for name in condorInfoConfig.platform:
            if name == self.platform:
                user_name = condorInfoConfig.platform[name].user.name
                user_home = condorInfoConfig.platform[name].user.home

        if self.platform == "lsst":
            if user_name is None:
                user_name = pwd.getpwuid(os.geteuid()).pw_name
            if user_home is None:
                user_home = os.getenv('HOME')

        if user_name is None:
            raise RuntimeError("error: %s does not specify user name for platform == %s" %
                               (condorInfoFileName, self.platform))
        if user_home is None:
            raise RuntimeError("error: %s does not specify user home for platform == %s" %
                               (condorInfoFileName, self.platform))

        self.defaults["USER_NAME"] = user_name
        self.defaults["USER_HOME"] = user_home

        # Values taken from the command line; entries that are not None
        # take precedence over the entries of the same name in "defaults".
        self.commandLineDefaults = {}

        self.commandLineDefaults["NODE_COUNT"] = self.opts.nodeCount
        self.commandLineDefaults["CPUS"] = self.opts.cpus
        self.commandLineDefaults["WALL_CLOCK"] = self.opts.maximumWallClock

        self.commandLineDefaults["QUEUE"] = self.opts.queue
        if self.opts.email == "no":
            self.commandLineDefaults["EMAIL_NOTIFICATION"] = "#"

        self.load()

    def createNodeSetName(self):
        """Creates the next "node_set" name, using the remote user name and
        a stored sequence number.

        Returns
        -------
        nodeSetName : `str`
            the new node_set name
        """
        s = SeqFile("$HOME/.lsst/node-set.seq")
        n = s.nextSeq()
        nodeSetName = "%s_%d" % (self.defaults["USER_NAME"], n)
        return nodeSetName

    def createUniqueIdentifier(self):
        """Creates a unique file identifier, based on the user's name
        and the time at which this method is invoked.

        Returns
        -------
        ident : `str`
            the new identifier
        """
        # This naming scheme follows the conventions used for creating
        # RUNID names.  We've found this allows these files to be more
        # easily located and shared with other users when debugging.
        # The tempfile.mkstemp method restricts the file to only the user,
        # and does not guarantee a file name that can easily be identified.
        now = datetime.now()
        username = pwd.getpwuid(os.geteuid()).pw_name
        ident = "%s_%02d_%02d%02d_%02d%02d%02d" % (
            username, now.year, now.month, now.day, now.hour, now.minute, now.second)
        return ident

    def load(self):
        """Loads all values from configuration and command line overrides into
        data structures suitable for use by the TemplateWriter object.
        """
        # localScratch is a string.Template; USER_NAME is the only value
        # substituted into it here.
        tempLocalScratch = Template(self.configuration.platform.localScratch)
        self.defaults["LOCAL_SCRATCH"] = tempLocalScratch.substitute(USER_NAME=self.defaults["USER_NAME"])
        self.defaults["SCHEDULER"] = self.configuration.platform.scheduler

    def loadAllocationConfig(self, name, suffix):
        """Loads all values from allocationConfig and command line overrides
        into data structures suitable for use by the TemplateWriter object.

        Parameters
        ----------
        name : `str`
            Name of the allocation config file; it is passed through
            ``envString.resolve`` before being loaded.
        suffix : `str`
            Extension used for the generated submit file name.

        Returns
        -------
        allocationConfig : `AllocationConfig`
            The loaded allocation configuration.

        Raises
        ------
        RuntimeError
            Raised if the resolved file name does not exist.
        """
        resolvedName = envString.resolve(name)
        allocationConfig = AllocationConfig()
        if not os.path.exists(resolvedName):
            raise RuntimeError("%s was not found." % resolvedName)
        allocationConfig.load(resolvedName)

        self.defaults["QUEUE"] = allocationConfig.platform.queue
        self.defaults["EMAIL_NOTIFICATION"] = allocationConfig.platform.email
        self.defaults["HOST_NAME"] = allocationConfig.platform.loginHostName

        self.defaults["UTILITY_PATH"] = allocationConfig.platform.utilityPath

        # A command line value, when given, wins over the config file value.
        if self.opts.glideinShutdown is None:
            self.defaults["GLIDEIN_SHUTDOWN"] = str(allocationConfig.platform.glideinShutdown)
        else:
            self.defaults["GLIDEIN_SHUTDOWN"] = str(self.opts.glideinShutdown)

        if self.opts.nodeSet is None:
            self.defaults["NODE_SET"] = self.createNodeSetName()
        else:
            self.defaults["NODE_SET"] = self.opts.nodeSet

        nodeSetName = self.defaults["NODE_SET"]

        # Log file names default to names derived from the node set name.
        if self.opts.outputLog is not None:
            self.defaults["OUTPUT_LOG"] = self.opts.outputLog
        else:
            self.defaults["OUTPUT_LOG"] = "%s.out" % nodeSetName

        if self.opts.errorLog is not None:
            self.defaults["ERROR_LOG"] = self.opts.errorLog
        else:
            self.defaults["ERROR_LOG"] = "%s.err" % nodeSetName

        # This is the TOTAL number of cores in the job, not just the total
        # of the cores you intend to use.   In other words, the total available
        # on a machine, times the number of machines.
        totalCoresPerNode = allocationConfig.platform.totalCoresPerNode
        self.commandLineDefaults["TOTAL_CORE_COUNT"] = self.opts.nodeCount * totalCoresPerNode

        self.uniqueIdentifier = self.createUniqueIdentifier()

        # write these pbs and config files to {LOCAL_DIR}/configs
        self.configDir = os.path.join(self.defaults["LOCAL_SCRATCH"], "configs")
        # exist_ok avoids failing when the directory is created by someone
        # else between an existence check and the makedirs call.
        os.makedirs(self.configDir, exist_ok=True)

        self.submitFileName = os.path.join(self.configDir, "alloc_%s.%s" % (self.uniqueIdentifier, suffix))

        self.condorConfigFileName = os.path.join(self.configDir, "condor_%s.config" % self.uniqueIdentifier)

        self.defaults["GENERATED_CONFIG"] = os.path.basename(self.condorConfigFileName)
        self.defaults["CONFIGURATION_ID"] = self.uniqueIdentifier
        return allocationConfig

    def createSubmitFile(self, inputFile):
        """Creates a PBS file using the file "inputFile" as a Template

        Parameters
        ----------
        inputFile : `str`
            Name of the template file.

        Returns
        -------
        outfile : `str`
            The newly created file name
        """
        outfile = self.createFile(inputFile, self.submitFileName)
        if self.opts.verbose:
            print("wrote new PBS file to %s" % outfile)
        return outfile

    def createCondorConfigFile(self, input):
        """Creates a Condor config file using the file "input" as a Template

        Parameters
        ----------
        input : `str`
            Name of the template file.

        Returns
        -------
        outfile : `str`
            The newly created file name
        """
        outfile = self.createFile(input, self.condorConfigFileName)
        if self.opts.verbose:
            print("wrote new condor_config file to %s" % outfile)
        return outfile

    def createFile(self, input, output):
        """Creates a new file, using "input" as a Template, and writes the
        new file to output.

        Parameters
        ----------
        input : `str`
            Name of the template file; it is passed through
            ``envString.resolve`` before use.
        output : `str`
            Name of the file to write.

        Returns
        -------
        outfile : `str`
            The newly created file name
        """
        resolvedInputName = envString.resolve(input)
        if self.opts.verbose:
            print("creating file using %s" % resolvedInputName)
        template = TemplateWriter()
        # Uses the associative arrays of "defaults" and "commandLineDefaults"
        # to write out the new file from the template.
        # The commandLineDefaults override values in "defaults", except for
        # entries that were left unset (None) on the command line.
        substitutes = self.defaults.copy()
        substitutes.update(
            (key, val) for key, val in self.commandLineDefaults.items() if val is not None
        )
        template.rewrite(resolvedInputName, output, substitutes)
        return output

    def isVerbose(self):
        """Status of the verbose flag

        Returns
        -------
        verbose : `bool`
            The value of the ``verbose`` option.
        """
        return self.opts.verbose

    def getUserName(self):
        """Accessor for USER_NAME

        Returns
        -------
        value
            the value of USER_NAME
        """
        return self.getParameter("USER_NAME")

    def getUserHome(self):
        """Accessor for USER_HOME

        Returns
        -------
        value
            the value of USER_HOME
        """
        return self.getParameter("USER_HOME")

    def getHostName(self):
        """Accessor for HOST_NAME

        Returns
        -------
        value
            the value of HOST_NAME
        """
        return self.getParameter("HOST_NAME")

    def getUtilityPath(self):
        """Accessor for UTILITY_PATH

        Returns
        -------
        value
            the value of UTILITY_PATH
        """
        return self.getParameter("UTILITY_PATH")

    def getScratchDirectory(self):
        """Accessor for SCRATCH_DIR

        Returns
        -------
        value
            the value of SCRATCH_DIR, or None if it has not been set
        """
        return self.getParameter("SCRATCH_DIR")

    def getLocalScratchDirectory(self):
        """Accessor for LOCAL_SCRATCH

        Returns
        -------
        value
            the value of LOCAL_SCRATCH
        """
        return self.getParameter("LOCAL_SCRATCH")

    def getNodeSetName(self):
        """Accessor for NODE_SET

        Returns
        -------
        value
            the value of NODE_SET
        """
        return self.getParameter("NODE_SET")

    def getNodes(self):
        """Accessor for NODE_COUNT

        Returns
        -------
        value
            the value of NODE_COUNT
        """
        return self.getParameter("NODE_COUNT")

    def getCPUs(self):
        """Accessor for CPUS

        Returns
        -------
        value
            the value of CPUS
        """
        return self.getParameter("CPUS")

    def getWallClock(self):
        """Accessor for WALL_CLOCK

        Returns
        -------
        value
            the value of WALL_CLOCK
        """
        return self.getParameter("WALL_CLOCK")

    def getScheduler(self):
        """Accessor for SCHEDULER

        Returns
        -------
        value
            the value of SCHEDULER
        """
        return self.getParameter("SCHEDULER")

    def getReservation(self):
        """Accessor for RESERVATION

        Returns
        -------
        value
            the value of RESERVATION, or None if it has not been set
        """
        return self.getParameter("RESERVATION")

    def getParameter(self, value):
        """Accessor for generic value

        Parameters
        ----------
        value : `str`
            Name of the parameter to look up.

        Returns
        -------
        result
            None if value is not set.  Otherwise, use the command line
            override (if set), or the default Config value
        """
        # A command line entry that is present but None means "not given on
        # the command line"; fall through to the default in that case, which
        # matches how createFile() treats overrides.
        override = self.commandLineDefaults.get(value)
        if override is not None:
            return override
        return self.defaults.get(value)

    def printNodeSetInfo(self):
        """Print a summary of the requested allocation followed by the
        node set name.
        """
        nodes = self.getNodes()
        cpus = self.getCPUs()
        wallClock = self.getWallClock()

        # pluralize "node" when more than one is requested
        nodeString = "s" if int(nodes) > 1 else ""

        if self.opts.dynamic is None:
            print("%s node%s will be allocated on %s with %s cpus per node and maximum time limit of %s" %
                  (nodes, nodeString, self.platform, cpus, wallClock))
        elif self.opts.dynamic == '__default__':
            print("%s node%s will be allocated on %s using default dynamic slots configuration "
                  "with %s cpus per node and maximum time limit of %s" %
                  (nodes, nodeString, self.platform, cpus, wallClock))
        else:
            print("%s node%s will be allocated on %s using dynamic slot block specified in "
                  "'%s' with %s cpus per node and maximum time limit of %s" %
                  (nodes, nodeString, self.platform, self.opts.dynamic, cpus, wallClock))
        print("Node set name:")
        print(self.getNodeSetName())

    def runCommand(self, cmd, verbose):
        """Run a command in a child process and wait for it to finish.

        Parameters
        ----------
        cmd : `str`
            Command line to run; it is split on whitespace (no shell
            quoting is honored).
        verbose : `bool`
            If False, the child's stdin, stdout and stderr are closed
            before the command is executed.

        Returns
        -------
        exitCode : `int`
            The exit status of the command, 127 if the command could not
            be executed, or 128 plus the signal number if the command was
            terminated by a signal.

        Raises
        ------
        ValueError
            Raised if ``cmd`` contains no command to run.
        """
        cmd_split = cmd.split()
        if not cmd_split:
            # fail before forking rather than inside the child
            raise ValueError("no command specified")

        pid = os.fork()
        if not pid:
            # Child process.  Whatever happens below, this process must
            # never return into the caller's code; it either becomes the
            # command or exits.
            try:
                # Methods of file transfer and login may
                # produce different output, depending on how
                # the "gsi" utilities are used.  The user can
                # either use grid proxies or ssh, and gsiscp/gsissh
                # does the right thing.  Since the output will be
                # different in either case anything potentially parsing this
                # output (like drpRun), would have to go through extra
                # steps to deal with this output, and which ultimately
                # end up not being useful.  So we optionally close the i/o
                # output of the executing command down.
                #
                # stdin/stdout/stderr is treated specially
                # by python, so we have to close down
                # both the python objects and the
                # underlying c implementations
                if not verbose:
                    # close python i/o
                    sys.stdin.close()
                    sys.stdout.close()
                    sys.stderr.close()
                    # close C's i/o
                    os.close(0)
                    os.close(1)
                    os.close(2)
                os.execvp(cmd_split[0], cmd_split)
            except OSError as exc:
                if verbose:
                    message = "error: could not execute %s: %s\n" % (cmd_split[0], exc)
                    os.write(2, message.encode())
            finally:
                # Only reached when the exec did not happen; 127 is the
                # conventional "command could not be run" status.
                os._exit(127)

        # Wait for this specific child; os.wait() could reap an unrelated one.
        _, status = os.waitpid(pid, 0)
        if os.WIFSIGNALED(status):
            # A signal-terminated child has no exit status of its own;
            # report it as a failure rather than as 0.
            return 128 + os.WTERMSIG(status)
        return os.WEXITSTATUS(status)