#Licensed to the Apache Software Foundation (ASF) under one
#or more contributor license agreements.  See the NOTICE file
#distributed with this work for additional information
#regarding copyright ownership.  The ASF licenses this file
#to you under the Apache License, Version 2.0 (the
#"License"); you may not use this file except in compliance
#with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# -*- python -*-
---|
17 | |
---|
18 | import sys, os, getpass, pprint, re, cPickle, random, shutil, time, errno |
---|
19 | |
---|
20 | import hodlib.Common.logger |
---|
21 | |
---|
22 | from hodlib.ServiceRegistry.serviceRegistry import svcrgy |
---|
23 | from hodlib.Common.xmlrpc import hodXRClient |
---|
24 | from hodlib.Common.util import to_http_url, get_exception_string |
---|
25 | from hodlib.Common.util import get_exception_error_string |
---|
26 | from hodlib.Common.util import hodInterrupt, HodInterruptException |
---|
27 | from hodlib.Common.util import HOD_INTERRUPTED_CODE |
---|
28 | |
---|
29 | from hodlib.Common.nodepoolutil import NodePoolUtil |
---|
30 | from hodlib.Hod.hadoop import hadoopCluster, hadoopScript |
---|
31 | |
---|
# Name of the per-user state file mapping cluster directories to job ids.
CLUSTER_DATA_FILE = 'clusters'

# Messages logged when the user state file cannot be accessed.
# Index meaning:
#   0 - not readable
#   1 - not writable
#   2 - neither readable nor writable
#   3 - not updatable after a deallocation
INVALID_STATE_FILE_MSGS = [
    "Requested operation cannot be performed. Cannot read %s: "
    "Permission denied.",

    "Requested operation cannot be performed. "
    "Cannot write to %s: Permission denied.",

    "Requested operation cannot be performed. "
    "Cannot read/write to %s: Permission denied.",

    "Cannot update %s: Permission denied. "
    "Cluster is deallocated, but info and list "
    "operations might show incorrect information.",
    ]
---|
50 | |
---|
class hodState:
  """Persist per-id state records as pickled dictionaries.

  Each record lives in a file named <store>/<id>.state.  The store
  directory is created on construction if it does not exist.  The
  store is effectively single-record: writing a fresh id first clears
  any other *.state files in the store (see write()).
  """
  def __init__(self, store):
    # store: directory under which the *.state files are kept.
    self.__store = store
    self.__stateFile = None
    # Assign the extension before any helper can need it.
    self.__STORE_EXT = ".state"
    self.__init_store()

  def __init_store(self):
    # Create the store directory on first use.
    if not os.path.exists(self.__store):
      os.mkdir(self.__store)

  def __set_state_file(self, id=None):
    """Point __stateFile at <store>/<id>.state, or scan for one.

    Without an id, the last *.state file found in the store wins
    (legacy behavior; the store is assumed to hold at most one).
    """
    if id:
      self.__stateFile = os.path.join(self.__store,
                                      "%s%s" % (id, self.__STORE_EXT))
    else:
      for item in os.listdir(self.__store):
        if item.endswith(self.__STORE_EXT):
          self.__stateFile = os.path.join(self.__store, item)

  def get_state_file(self):
    """Return the path of the currently selected state file (or None)."""
    return self.__stateFile

  def checkStateFile(self, id=None, modes=(os.R_OK,)):
    """Return True if the state file is absent or accessible in 'modes'.

    A missing (or not yet determined) state file counts as accessible,
    because HOD can create it and will then have permission to read
    and/or write it.
    """
    self.__set_state_file(id)

    # Robustness fix: with no id and an empty store __stateFile stays
    # None; treat that like a missing file instead of os.stat(None).
    if self.__stateFile is None:
      return True

    try:
      os.stat(self.__stateFile)
    except OSError as err:
      if err.errno == errno.ENOENT:   # error 2 (no such file)
        return True

    # File exists: require every requested access mode.
    ret = True
    for mode in modes:
      ret = ret and os.access(self.__stateFile, mode)
    return ret

  def read(self, id=None):
    """Return the unpickled state dict for 'id', or {} if unavailable."""
    info = {}

    self.__set_state_file(id)

    if self.__stateFile:
      if os.path.isfile(self.__stateFile):
        # Binary mode: pickle streams are byte-oriented.
        stateFile = open(self.__stateFile, 'rb')
        try:
          try:
            info = cPickle.load(stateFile)
          except EOFError:
            # Empty or truncated file - treat as no recorded state.
            pass
        finally:
          # Close even if unpickling raises something unexpected.
          stateFile.close()

    return info

  def write(self, id, info):
    """Pickle 'info' into the state file for 'id'.

    When the target file does not exist yet, any other *.state files
    in the store are cleared first, preserving the one-record store
    invariant.
    """
    self.__set_state_file(id)
    if not os.path.exists(self.__stateFile):
      self.clear(id)

    stateFile = open(self.__stateFile, 'wb')
    try:
      cPickle.dump(info, stateFile)
    finally:
      stateFile.close()

  def clear(self, id=None):
    """Remove the state file for 'id', or every *.state file in the store."""
    self.__set_state_file(id)
    if self.__stateFile and os.path.exists(self.__stateFile):
      os.remove(self.__stateFile)
    else:
      for item in os.listdir(self.__store):
        if item.endswith(self.__STORE_EXT):
          # Bug fix: os.listdir returns bare file names, so removal must
          # be anchored to the store directory, not the current directory.
          os.remove(os.path.join(self.__store, item))
---|
126 | |
---|
class hodRunner:
  """Executes hod operations (allocate/deallocate/info/list/script/help).

  The entry points are operation(), which dispatches on the configured
  'hod.operation' string to a _op_<name> method, and script(), which
  allocates a cluster, runs a user script on it, and deallocates.
  Results are communicated through numeric op codes in self.__opCode
  (0 = success; see the per-branch comments below for other values).
  """

  def __init__(self, cfg, log=None, cluster=None):
    # cfg: parsed hod configuration (dict-of-dicts keyed by section).
    self.__hodhelp = hodHelp()
    self.__ops = self.__hodhelp.ops
    self.__cfg = cfg
    self.__npd = self.__cfg['nodepooldesc']
    self.__opCode = 0
    self.__user = getpass.getuser()
    self.__registry = None
    self.__baseLogger = None
    # Allowing to pass in log object to help testing - a stub can be passed in
    if log is None:
      self.__setup_logger()
    else:
      self.__log = log

    # Per-user state (maps cluster directories to job ids).
    self.__userState = hodState(self.__cfg['hod']['user_state'])

    # Per-cluster state; created lazily by __setup_cluster_state.
    self.__clusterState = None
    self.__clusterStateInfo = { 'env' : None, 'hdfs' : None, 'mapred' : None }

    # Allowing to pass in cluster object to help testing - a stub can be passed in
    if cluster is None:
      self.__cluster = hadoopCluster(self.__cfg, self.__log)
    else:
      self.__cluster = cluster

  def __setup_logger(self):
    """Build the base hod logger plus a per-user logger; optionally add
    stream and syslog handlers per the 'hod' config section."""
    self.__baseLogger = hodlib.Common.logger.hodLog('hod')
    self.__log = self.__baseLogger.add_logger(self.__user )

    if self.__cfg['hod']['stream']:
      self.__baseLogger.add_stream(level=self.__cfg['hod']['debug'],
                                   addToLoggerNames=(self.__user ,))

    if self.__cfg['hod'].has_key('syslog-address'):
      self.__baseLogger.add_syslog(self.__cfg['hod']['syslog-address'],
                                   level=self.__cfg['hod']['debug'],
                                   addToLoggerNames=(self.__user ,))

  def get_logger(self):
    """Return the per-user logger in use."""
    return self.__log

  def __setup_cluster_logger(self, directory):
    """Add a rotating file handler that logs into the cluster directory."""
    self.__baseLogger.add_file(logDirectory=directory, level=4,
                               backupCount=self.__cfg['hod']['log-rollover-count'],
                               addToLoggerNames=(self.__user ,))

  def __setup_cluster_state(self, directory):
    """Attach a hodState rooted at the given cluster directory."""
    self.__clusterState = hodState(directory)

  def __norm_cluster_dir(self, directory):
    """Expand ~ and resolve a relative path against hod.original-dir,
    returning an absolute cluster directory path."""
    directory = os.path.expanduser(directory)
    if not os.path.isabs(directory):
      directory = os.path.join(self.__cfg['hod']['original-dir'], directory)
    directory = os.path.abspath(directory)

    return directory

  def __setup_service_registry(self):
    """Start the service registry and record its XML-RPC address in the
    'hod' and 'ringmaster' config sections."""
    cfg = self.__cfg['hod'].copy()
    cfg['debug'] = 0
    self.__registry = svcrgy(cfg, self.__log)
    self.__registry.start()
    self.__log.debug(self.__registry.getXMLRPCAddr())
    self.__cfg['hod']['xrs-address'] = self.__registry.getXMLRPCAddr()
    self.__cfg['ringmaster']['svcrgy-addr'] = self.__cfg['hod']['xrs-address']

  def __set_cluster_state_info(self, env, hdfs, mapred, ring, jobid, min, max):
    """Record allocation details (environment, service URLs, job id and
    node bounds) in the in-memory cluster state dict.

    NOTE(review): 'min'/'max' shadow the builtins here and elsewhere in
    this class; kept as-is for fidelity.
    """
    self.__clusterStateInfo['env'] = env
    self.__clusterStateInfo['hdfs'] = "http://%s" % hdfs
    self.__clusterStateInfo['mapred'] = "http://%s" % mapred
    self.__clusterStateInfo['ring'] = ring
    self.__clusterStateInfo['jobid'] = jobid
    self.__clusterStateInfo['min'] = min
    self.__clusterStateInfo['max'] = max

  def __set_user_state_info(self, info):
    """Merge 'info' into the per-user cluster map and persist it."""
    userState = self.__userState.read(CLUSTER_DATA_FILE)
    for key in info.keys():
      userState[key] = info[key]

    self.__userState.write(CLUSTER_DATA_FILE, userState)

  def __remove_cluster(self, clusterDir):
    """Drop 'clusterDir' from the per-user cluster map, if present."""
    clusterInfo = self.__userState.read(CLUSTER_DATA_FILE)
    if clusterDir in clusterInfo:
      del(clusterInfo[clusterDir])
      self.__userState.write(CLUSTER_DATA_FILE, clusterInfo)

  def __cleanup(self):
    # Stop the service registry if it was started.
    if self.__registry: self.__registry.stop()

  def __check_operation(self, operation):
    """Split the operation string; print help and set op code 2 when the
    first token is not a known operation."""
    opList = operation.split()

    if not opList[0] in self.__ops:
      self.__log.critical("Invalid hod operation specified: %s" % operation)
      self._op_help(None)
      self.__opCode = 2

    return opList

  def __adjustMasterFailureCountConfig(self, nodeCount):
    # This method adjusts the ringmaster.max-master-failures variable
    # to a value that is bounded by the a function of the number of
    # nodes.

    maxFailures = self.__cfg['ringmaster']['max-master-failures']
    # Count number of masters required - depends on which services
    # are external
    masters = 0
    if not self.__cfg['gridservice-hdfs']['external']:
      masters += 1
    if not self.__cfg['gridservice-mapred']['external']:
      masters += 1

    # So, if there are n nodes and m masters, we look atleast for
    # all masters to come up. Therefore, atleast m nodes should be
    # good, which means a maximum of n-m master nodes can fail.
    maxFailedNodes = nodeCount - masters

    # The configured max number of failures is now bounded by this
    # number.
    self.__cfg['ringmaster']['max-master-failures'] = \
                              min(maxFailures, maxFailedNodes)

  def _op_allocate(self, args):
    """Allocate a cluster: args = ('allocate', clusterDir, nodes).

    'nodes' is either a single count or a 'min-max' range string.
    Sets __opCode: 0 success, 1 state-file/unknown failure, 3 bad
    arguments, 4 account verification failed, 12 already allocated,
    10/13/14 cluster (or hdfs/mapred) dead.
    """
    operation = "allocate"
    argLength = len(args)
    min = 0
    max = 0
    errorFlag = False
    errorMsgs = []

    if argLength == 3:
      nodes = args[2]
      clusterDir = self.__norm_cluster_dir(args[1])

      if not os.path.exists(clusterDir):
        try:
          os.makedirs(clusterDir)
        except OSError, err:
          errorFlag = True
          errorMsgs.append("Could not create cluster directory. %s" \
                           % (str(err)))
      elif not os.path.isdir(clusterDir):
        errorFlag = True
        errorMsgs.append( \
                    "Invalid cluster directory (--hod.clusterdir or -d) : " + \
                         clusterDir + " : Not a directory")

      if int(nodes) < 3 :
        errorFlag = True
        errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \
                         "Must be >= 3. Given nodes: %s" % nodes)
      if errorFlag:
        for msg in errorMsgs:
          self.__log.critical(msg)
        self.__opCode = 3
        return

      # The user state file must be both readable and writable to allocate.
      if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, \
                                             (os.R_OK, os.W_OK)):
        self.__log.critical(INVALID_STATE_FILE_MSGS[2] % \
                            self.__userState.get_state_file())
        self.__opCode = 1
        return

      clusterList = self.__userState.read(CLUSTER_DATA_FILE)
      if clusterDir in clusterList.keys():
        self.__setup_cluster_state(clusterDir)
        clusterInfo = self.__clusterState.read()
        # Check if the job is not running. Only then can we safely
        # allocate another cluster. Otherwise the user would need
        # to deallocate and free up resources himself.
        if clusterInfo.has_key('jobid') and \
            self.__cluster.is_cluster_deallocated(clusterInfo['jobid']):
          self.__log.warn("Found a dead cluster at cluster directory '%s'. Deallocating it to allocate a new one." % (clusterDir))
          self.__remove_cluster(clusterDir)
          self.__clusterState.clear()
        else:
          self.__log.critical("Found a previously allocated cluster at cluster directory '%s'. HOD cannot determine if this cluster can be automatically deallocated. Deallocate the cluster if it is unused." % (clusterDir))
          self.__opCode = 12
          return

      self.__setup_cluster_logger(clusterDir)

      # Run the configured account-verification hook before allocating.
      # NOTE(review): 'status is not 0' is an identity test that happens
      # to work for small ints in CPython; '!=' is what is meant.
      (status, message) = self.__cluster.is_valid_account()
      if status is not 0:
        if message:
          for line in message:
            self.__log.critical("verify-account output: %s" % line)
        self.__log.critical("Cluster cannot be allocated because account verification failed. " \
                            + "verify-account returned exit code: %s." % status)
        self.__opCode = 4
        return
      else:
        self.__log.debug("verify-account returned zero exit code.")
        if message:
          self.__log.debug("verify-account output: %s" % message)

      # NOTE(review): non-raw regex relies on '\d' not being a recognized
      # string escape; a raw string would be cleaner.
      if re.match('\d+-\d+', nodes):
        (min, max) = nodes.split("-")
        min = int(min)
        max = int(max)
      else:
        try:
          nodes = int(nodes)
          min = nodes
          max = nodes
        except ValueError:
          print self.__hodhelp.help(operation)
          self.__log.critical( \
          "%s operation requires a pos_int value for n(nodecount)." % \
          operation)
          self.__opCode = 3
        else:
          # int conversion succeeded: perform the actual allocation.
          self.__setup_cluster_state(clusterDir)
          clusterInfo = self.__clusterState.read()
          self.__opCode = self.__cluster.check_cluster(clusterInfo)
          if self.__opCode == 0 or self.__opCode == 15:
            self.__setup_service_registry()
            if hodInterrupt.isSet():
              self.__cleanup()
              raise HodInterruptException()
            self.__log.debug("Service Registry started.")

            self.__adjustMasterFailureCountConfig(nodes)

            try:
              allocateStatus = self.__cluster.allocate(clusterDir, min, max)
            except HodInterruptException, h:
              self.__cleanup()
              raise h
            # Allocation has gone through.
            # Don't care about interrupts any more

            try:
              if allocateStatus == 0:
                # Persist the allocation details for later info/list ops.
                self.__set_cluster_state_info(os.environ,
                                              self.__cluster.hdfsInfo,
                                              self.__cluster.mapredInfo,
                                              self.__cluster.ringmasterXRS,
                                              self.__cluster.jobId,
                                              min, max)
                self.__setup_cluster_state(clusterDir)
                self.__clusterState.write(self.__cluster.jobId,
                                          self.__clusterStateInfo)
                #  Do we need to check for interrupts here ??

                self.__set_user_state_info(
                  { clusterDir : self.__cluster.jobId, } )
              self.__opCode = allocateStatus
            except Exception, e:
              # Some unknown problem.
              self.__cleanup()
              self.__cluster.deallocate(clusterDir, self.__clusterStateInfo)
              self.__opCode = 1
              raise Exception(e)
          elif self.__opCode == 12:
            self.__log.critical("Cluster %s already allocated." % clusterDir)
          elif self.__opCode == 10:
            self.__log.critical("dead\t%s\t%s" % (clusterInfo['jobid'],
                                                  clusterDir))
          elif self.__opCode == 13:
            self.__log.warn("hdfs dead\t%s\t%s" % (clusterInfo['jobid'],
                                                   clusterDir))
          elif self.__opCode == 14:
            self.__log.warn("mapred dead\t%s\t%s" % (clusterInfo['jobid'],
                                                     clusterDir))

          if self.__opCode > 0 and self.__opCode != 15:
            self.__log.critical("Cannot allocate cluster %s" % clusterDir)
    else:
      print self.__hodhelp.help(operation)
      self.__log.critical("%s operation requires two arguments. " % operation
                        + "A cluster directory and a nodecount.")
      self.__opCode = 3

  def _is_cluster_allocated(self, clusterDir):
    """Return True if 'clusterDir' holds a non-empty cluster state."""
    if os.path.isdir(clusterDir):
      self.__setup_cluster_state(clusterDir)
      clusterInfo = self.__clusterState.read()
      if clusterInfo != {}:
        return True
    return False

  def _op_deallocate(self, args):
    """Deallocate a cluster: args = ('deallocate', clusterDir).

    Clears the cluster state regardless of whether the underlying
    deallocation succeeded, then removes the user-state entry.
    """
    operation = "deallocate"
    argLength = len(args)
    if argLength == 2:
      clusterDir = self.__norm_cluster_dir(args[1])
      if os.path.isdir(clusterDir):
        self.__setup_cluster_state(clusterDir)
        clusterInfo = self.__clusterState.read()
        if clusterInfo == {}:
          self.__handle_invalid_cluster_directory(clusterDir, cleanUp=True)
        else:
          self.__opCode = \
            self.__cluster.deallocate(clusterDir, clusterInfo)
          # Irrespective of whether deallocate failed or not,
          # remove the cluster state.
          self.__clusterState.clear()
          if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, (os.W_OK,)):
            self.__log.critical(INVALID_STATE_FILE_MSGS[3] % \
                                self.__userState.get_state_file())
            self.__opCode = 1
            return
          self.__remove_cluster(clusterDir)
      else:
        self.__handle_invalid_cluster_directory(clusterDir, cleanUp=True)
    else:
      print self.__hodhelp.help(operation)
      self.__log.critical("%s operation requires one argument. " % operation
                        + "A cluster path.")
      self.__opCode = 3

  def _op_list(self, args):
    """List all clusters in the user state with a liveness summary."""
    operation = 'list'
    clusterList = self.__userState.read(CLUSTER_DATA_FILE)
    for path in clusterList.keys():
      if not os.path.isdir(path):
        self.__log.info("cluster state unknown\t%s\t%s" % (clusterList[path], path))
        continue
      self.__setup_cluster_state(path)
      clusterInfo = self.__clusterState.read()
      if clusterInfo == {}:
        # something wrong with the cluster directory.
        self.__log.info("cluster state unknown\t%s\t%s" % (clusterList[path], path))
        continue
      # check_cluster status codes: 12 alive, 10 dead, 13 hdfs dead,
      # 14 mapred dead (as used below).
      clusterStatus = self.__cluster.check_cluster(clusterInfo)
      if clusterStatus == 12:
        self.__log.info("alive\t%s\t%s" % (clusterList[path], path))
      elif clusterStatus == 10:
        self.__log.info("dead\t%s\t%s" % (clusterList[path], path))
      elif clusterStatus == 13:
        self.__log.info("hdfs dead\t%s\t%s" % (clusterList[path], path))
      elif clusterStatus == 14:
        self.__log.info("mapred dead\t%s\t%s" % (clusterList[path], path))

  def _op_info(self, args):
    """Print detailed information for one cluster: args = ('info', dir).

    The final cluster status becomes the op code.
    """
    operation = 'info'
    argLength = len(args)
    if argLength == 2:
      clusterDir = self.__norm_cluster_dir(args[1])
      if os.path.isdir(clusterDir):
        self.__setup_cluster_state(clusterDir)
        clusterInfo = self.__clusterState.read()
        if clusterInfo == {}:
          # something wrong with the cluster directory.
          self.__handle_invalid_cluster_directory(clusterDir)
        else:
          clusterStatus = self.__cluster.check_cluster(clusterInfo)
          if clusterStatus == 12:
            self.__print_cluster_info(clusterInfo)
            self.__log.info("hadoop-site.xml at %s" % clusterDir)
          elif clusterStatus == 10:
            self.__log.critical("%s cluster is dead" % clusterDir)
          elif clusterStatus == 13:
            self.__log.warn("%s cluster hdfs is dead" % clusterDir)
          elif clusterStatus == 14:
            self.__log.warn("%s cluster mapred is dead" % clusterDir)

          if clusterStatus != 12:
            if clusterStatus == 15:
              self.__log.critical("Cluster %s not allocated." % clusterDir)
            else:
              # Partially-dead cluster: still show what is known.
              self.__print_cluster_info(clusterInfo)
              self.__log.info("hadoop-site.xml at %s" % clusterDir)

          self.__opCode = clusterStatus
      else:
        self.__handle_invalid_cluster_directory(clusterDir)
    else:
      print self.__hodhelp.help(operation)
      self.__log.critical("%s operation requires one argument. " % operation
                        + "A cluster path.")
      self.__opCode = 3

  def __handle_invalid_cluster_directory(self, clusterDir, cleanUp=False):
    """Diagnose a cluster directory with no readable state.

    With cleanUp=True, also frees the job tied to it (if tracked) and
    removes it from the user state.  Sets op code 1, 3 or 15.
    """
    if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, (os.R_OK,)):
      self.__log.critical(INVALID_STATE_FILE_MSGS[0] % \
                          self.__userState.get_state_file())
      self.__opCode = 1
      return

    clusterList = self.__userState.read(CLUSTER_DATA_FILE)
    if clusterDir in clusterList.keys():
      # previously allocated cluster.
      self.__log.critical("Cannot find information for cluster with id '%s' in previously allocated cluster directory '%s'." % (clusterList[clusterDir], clusterDir))
      if cleanUp:
        self.__cluster.delete_job(clusterList[clusterDir])
        self.__log.critical("Freeing resources allocated to the cluster.")
        if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, (os.W_OK,)):
          self.__log.critical(INVALID_STATE_FILE_MSGS[1] % \
                              self.__userState.get_state_file())
          self.__opCode = 1
          return
        self.__remove_cluster(clusterDir)
      self.__opCode = 3
    else:
      if not os.path.exists(clusterDir):
        self.__log.critical( \
                  "Invalid hod.clusterdir(--hod.clusterdir or -d). " + \
                  clusterDir + " : No such directory")
      elif not os.path.isdir(clusterDir):
        self.__log.critical( \
                  "Invalid hod.clusterdir(--hod.clusterdir or -d). " + \
                  clusterDir + " : Not a directory")
      else:
        self.__log.critical( \
                  "Invalid hod.clusterdir(--hod.clusterdir or -d). " + \
                  clusterDir + " : Not tied to any allocated cluster.")
      self.__opCode = 15

  def __print_cluster_info(self, clusterInfo):
    """Log the user-facing fields of a cluster state dict."""
    keys = clusterInfo.keys()

    # Map state keys to display labels.
    _dict = {
              'jobid' : 'Cluster Id', 'min' : 'Nodecount',
              'hdfs' : 'HDFS UI at' , 'mapred' : 'Mapred UI at'
            }

    for key in _dict.keys():
      if clusterInfo.has_key(key):
        self.__log.info("%s %s" % (_dict[key], clusterInfo[key]))

    if clusterInfo.has_key('ring'):
      self.__log.debug("%s\t%s" % ('Ringmaster at ', clusterInfo['ring']))

    # At the highest debug level, dump the recorded environment too.
    if self.__cfg['hod']['debug'] == 4:
      for var in clusterInfo['env'].keys():
        self.__log.debug("%s = %s" % (var, clusterInfo['env'][var]))

  def _op_help(self, arg):
    """Print help for one operation, or for all when arg is None/short."""
    if arg == None or arg.__len__() != 2:
      print "hod commands:\n"
      for op in self.__ops:
        print self.__hodhelp.help(op)
    else:
      if arg[1] not in self.__ops:
        print self.__hodhelp.help('help')
        self.__log.critical("Help requested for invalid operation : %s"%arg[1])
        self.__opCode = 3
      else: print self.__hodhelp.help(arg[1])

  def operation(self):
    """Dispatch the configured hod.operation and return the op code.

    Interrupts map to HOD_INTERRUPTED_CODE; any other exception is
    logged and leaves the current op code in place.
    """
    operation = self.__cfg['hod']['operation']
    try:
      opList = self.__check_operation(operation)
      if self.__opCode == 0:
        if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, (os.R_OK,)):
          self.__log.critical(INVALID_STATE_FILE_MSGS[0] % \
                              self.__userState.get_state_file())
          self.__opCode = 1
          return self.__opCode
        getattr(self, "_op_%s" % opList[0])(opList)
    except HodInterruptException, h:
      self.__log.critical("op: %s failed because of a process interrupt." \
                                                                % operation)
      self.__opCode = HOD_INTERRUPTED_CODE
    except:
      self.__log.critical("op: %s failed: %s" % (operation,
                          get_exception_error_string()))
      self.__log.debug(get_exception_string())

    self.__cleanup()

    self.__log.debug("return code: %s" % self.__opCode)

    return self.__opCode

  def script(self):
    """Allocate a cluster, run the configured script on it, deallocate.

    Validates hod.script / hod.clusterdir / hod.nodecount first and
    exits with code 3 on bad input.  The script's own exit code takes
    precedence in the returned op code (see handle_script_exit_code).
    """
    errorFlag = False
    errorMsgs = []
    scriptRet = 0 # return from the script, if run

    script = self.__cfg['hod']['script']
    nodes = self.__cfg['hod']['nodecount']
    clusterDir = self.__cfg['hod']['clusterdir']

    if not os.path.exists(script):
      errorFlag = True
      errorMsgs.append("Invalid script file (--hod.script or -s) : " + \
                       script + " : No such file")
    elif not os.path.isfile(script):
      errorFlag = True
      errorMsgs.append("Invalid script file (--hod.script or -s) : " + \
                       script + " : Not a file.")
    else:
      isExecutable = os.access(script, os.X_OK)
      if not isExecutable:
        errorFlag = True
        errorMsgs.append("Invalid script file (--hod.script or -s) : " + \
                         script + " : Not an executable.")

    if not os.path.exists(clusterDir):
      try:
        os.makedirs(clusterDir)
      except OSError, err:
        errorFlag = True
        errorMsgs.append("Could not create cluster directory. %s" % (str(err)))
    elif not os.path.isdir(clusterDir):
      errorFlag = True
      errorMsgs.append( \
                  "Invalid cluster directory (--hod.clusterdir or -d) : " + \
                       clusterDir + " : Not a directory")

    if int(self.__cfg['hod']['nodecount']) < 3 :
      errorFlag = True
      errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \
                       "Must be >= 3. Given nodes: %s" % nodes)

    if errorFlag:
      for msg in errorMsgs:
        self.__log.critical(msg)
      self.handle_script_exit_code(scriptRet, clusterDir)
      sys.exit(3)

    try:
      self._op_allocate(('allocate', clusterDir, str(nodes)))
      if self.__opCode == 0:
        # Optional settling delay before running the user script.
        if self.__cfg['hod'].has_key('script-wait-time'):
          time.sleep(self.__cfg['hod']['script-wait-time'])
          self.__log.debug('Slept for %d time. Now going to run the script' % self.__cfg['hod']['script-wait-time'])
        if hodInterrupt.isSet():
          self.__log.debug('Hod interrupted - not executing script')
        else:
          scriptRunner = hadoopScript(clusterDir,
                                      self.__cfg['hod']['original-dir'])
          self.__opCode = scriptRunner.run(script)
          scriptRet = self.__opCode
          self.__log.info("Exit code from running the script: %d" % self.__opCode)
      else:
        self.__log.critical("Error %d in allocating the cluster. Cannot run the script." % self.__opCode)

      if hodInterrupt.isSet():
        # Got interrupt while executing script. Unsetting it for deallocating
        hodInterrupt.setFlag(False)
      if self._is_cluster_allocated(clusterDir):
        self._op_deallocate(('deallocate', clusterDir))
    except HodInterruptException, h:
      self.__log.critical("Script failed because of a process interrupt.")
      self.__opCode = HOD_INTERRUPTED_CODE
    except:
      self.__log.critical("script: %s failed: %s" % (script,
                          get_exception_error_string()))
      self.__log.debug(get_exception_string())

    self.__cleanup()

    self.handle_script_exit_code(scriptRet, clusterDir)

    return self.__opCode

  def handle_script_exit_code(self, scriptRet, clusterDir):
    # We want to give importance to a failed script's exit code, and write out exit code to a file separately
    # so users can easily get it if required. This way they can differentiate between the script's exit code
    # and hod's exit code.
    if os.path.exists(clusterDir):
      exit_code_file_name = (os.path.join(clusterDir, 'script.exitcode'))
      if scriptRet != 0:
        exit_code_file = open(exit_code_file_name, 'w')
        print >>exit_code_file, scriptRet
        exit_code_file.close()
        self.__opCode = scriptRet
      else:
        #ensure script exit code file is not there:
        if (os.path.exists(exit_code_file_name)):
          os.remove(exit_code_file_name)
---|
class hodHelp:
  """Usage and description text for every hod operation."""

  def __init__(self):
    # Supported operations, in the order 'hod help' lists them.
    self.ops = ['allocate', 'deallocate', 'info', 'list','script', 'help']

    # One-line usage synopsis per operation.
    self.usage_strings = {
        'allocate':
            'hod allocate -d <clusterdir> -n <nodecount> [OPTIONS]',
        'deallocate':
            'hod deallocate -d <clusterdir> [OPTIONS]',
        'list':
            'hod list [OPTIONS]',
        'info':
            'hod info -d <clusterdir> [OPTIONS]',
        'script':
            'hod script -d <clusterdir> -n <nodecount> -s <script> [OPTIONS]',
        'help':
            'hod help <OPERATION>',
        }

    # Multi-line description per operation (implicit literal concatenation).
    self.description_strings = {
        'allocate':
            "Allocates a cluster of n nodes using the specified \n"
            " cluster directory to store cluster state \n"
            " information. The Hadoop site XML is also stored \n"
            " in this location.\n",

        'deallocate':
            "Deallocates a cluster using the specified \n"
            " cluster directory. This operation is also \n"
            " required to clean up a dead cluster.\n",

        'list':
            "List all clusters currently allocated by a user, \n"
            " along with limited status information and the \n"
            " cluster ID.\n",

        'info':
            "Provide detailed information on an allocated cluster.\n",

        'script':
            "Allocates a cluster of n nodes with the given \n"
            " cluster directory, runs the specified script \n"
            " using the allocated cluster, and then \n"
            " deallocates the cluster.\n",

        'help':
            "Print help for the operation and exit.\n"
            "Available operations : %s.\n" % self.ops,
        }

  def usage(self, op):
    """Return the short usage message for operation 'op'."""
    return "Usage : %s\nFor full description: hod help %s.\n" % \
           (self.usage_strings[op], op)

  def help(self, op=None):
    """Return the full help text for 'op', or a generic overview if None."""
    if op is None:
      return ("hod <operation> [ARGS] [OPTIONS]\n"
              "Available operations : %s\n"
              "For help on a particular operation : hod help <operation>.\n"
              "For all options : hod help options." % self.ops)
    else:
      return ("Usage : %s\n"
              "Description : %s"
              "For all options : hod help options.\n" % \
              (self.usage_strings[op], self.description_strings[op]))
---|