#Licensed to the Apache Software Foundation (ASF) under one
#or more contributor license agreements.  See the NOTICE file
#distributed with this work for additional information
#regarding copyright ownership.  The ASF licenses this file
#to you under the Apache License, Version 2.0 (the
#"License"); you may not use this file except in compliance
#with the License.  You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
16 | """define WorkLoad as abstract interface for user job""" |
---|
17 | # -*- python -*- |
---|

import os, time, sys, shutil, exceptions, re, threading, signal, urllib, pprint, math

from HTMLParser import HTMLParser

import xml.dom.minidom
import xml.dom.pulldom
from xml.dom import getDOMImplementation

from hodlib.Common.util import *
from hodlib.Common.xmlrpc import hodXRClient
from hodlib.Common.miniHTMLParser import miniHTMLParser
from hodlib.Common.nodepoolutil import NodePoolUtil
from hodlib.Common.tcp import tcpError, tcpSocket

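# Matches a ';' that is not preceded by a backslash, i.e. an unescaped
# command delimiter.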
reCommandDelimeterString = r"(?<!\\);"
reCommandDelimeter = re.compile(reCommandDelimeterString)

class hadoopConfig:
  def __create_xml_element(self, doc, name, value, description, final=False):
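    """Builds one <property> DOM element (name, value, description and an
       optional <final>true</final> marker) in the hadoop-site.xml format."""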
    prop = doc.createElement("property")
    nameP = doc.createElement("name")
    string = doc.createTextNode(name)
    nameP.appendChild(string)
    valueP = doc.createElement("value")
    string = doc.createTextNode(value)
    valueP.appendChild(string)
    if final:
      finalP = doc.createElement("final")
      string = doc.createTextNode("true")
      finalP.appendChild(string)
    desc = doc.createElement("description")
    string = doc.createTextNode(description)
    desc.appendChild(string)
    prop.appendChild(nameP)
    prop.appendChild(valueP)
    if final:
      prop.appendChild(finalP)
    prop.appendChild(desc)

    return prop

  def gen_site_conf(self, confDir, tempDir, numNodes, hdfsAddr, mrSysDir,\
                    mapredAddr=None, clientParams=None, serverParams=None,\
                    finalServerParams=None, clusterFactor=None):
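    """Writes confDir/hadoop-site.xml for the allocated cluster, merging
       HOD-generated parameters with final-server, server and client
       parameters. Earlier categories are never overridden by later ones,
       except that client params replace plain server params."""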
    if not mapredAddr:
      mapredAddr = "dummy:8181"

    implementation = getDOMImplementation()
    doc = implementation.createDocument('', 'configuration', None)
    comment = doc.createComment(
      "This is an auto-generated hadoop-site.xml, do not modify")
    topElement = doc.documentElement
    topElement.appendChild(comment)

    description = {}
    paramsDict = { 'mapred.job.tracker' : mapredAddr, \
                   'fs.default.name' : "hdfs://" + hdfsAddr, \
                   'hadoop.tmp.dir' : tempDir, \
                 }

    paramsDict['mapred.system.dir'] = mrSysDir

    # mapred-default.xml is no longer used.
    numred = int(math.floor(clusterFactor * (int(numNodes) - 1)))
    paramsDict['mapred.reduce.tasks'] = str(numred)
    # end

    # set the description for all the variables generated above
    for k, v in paramsDict.iteritems():
      description[k] = 'Hod generated parameter'

    # final-server-params
    if finalServerParams:
      for k, v in finalServerParams.iteritems():
        if not description.has_key(k):
          description[k] = "final server parameter"
          paramsDict[k] = v

    # server-params
    if serverParams:
      for k, v in serverParams.iteritems():
        if not description.has_key(k):
          # if no final value for the same param is mentioned
          description[k] = "server parameter"
          paramsDict[k] = v

    # client-params
    if clientParams:
      for k, v in clientParams.iteritems():
        if not description.has_key(k) or description[k] == "server parameter":
          # Just add, if no final value for the same param is mentioned.
          # Replace even if a server param is mentioned for the same config variable.
          description[k] = "client-side parameter"
          paramsDict[k] = v

    # generate the xml elements
    for k, v in paramsDict.iteritems():
      if ( description[k] == "final server parameter" or \
           description[k] == "Hod generated parameter" ):
        final = True
      else:
        final = False
      prop = self.__create_xml_element(doc, k, v, description[k], final)
      topElement.appendChild(prop)

    siteName = os.path.join(confDir, "hadoop-site.xml")
    sitefile = open(siteName, 'w')
    print >> sitefile, topElement.toxml()
    sitefile.close()

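# A minimal usage sketch (hypothetical addresses and paths, for
# illustration only):
#   hadoopCfg = hadoopConfig()
#   hadoopCfg.gen_site_conf('/tmp/hod-conf', '/tmp/hod-tmp', 5,
#                           'nn-host:8020', '/mapredsystem/hod',
#                           mapredAddr='jt-host:9001', clusterFactor=1.75)
# This writes /tmp/hod-conf/hadoop-site.xml with mapred.reduce.tasks set
# to floor(1.75 * (5 - 1)) = 7.
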
class hadoopCluster:
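  """Represents a HOD-allocated Hadoop cluster and drives its lifecycle:
     job submission through the nodepool, service registry lookups,
     hadoop-site.xml generation, and deallocation."""
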
  def __init__(self, cfg, log):
    self.__cfg = cfg
    self.__log = log
    self.__changedClusterParams = []

    self.__hostname = local_fqdn()
    self.__svcrgyClient = None
    self.__nodePool = NodePoolUtil.getNodePool(self.__cfg['nodepooldesc'],
                                               self.__cfg, self.__log)
    self.__hadoopCfg = hadoopConfig()
    self.jobId = None
    self.mapredInfo = None
    self.hdfsInfo = None
    self.ringmasterXRS = None

  def __get_svcrgy_client(self):
    svcrgyUrl = to_http_url(self.__cfg['hod']['xrs-address'])
    return hodXRClient(svcrgyUrl)

  def __get_service_status(self):
    serviceData = self.__get_service_data()

    status = True
    hdfs = False
    mapred = False

    for host in serviceData.keys():
      for item in serviceData[host]:
        service = item.keys()
        if service[0] == 'hdfs.grid' and \
          self.__cfg['gridservice-hdfs']['external'] == False:
          hdfs = True
        elif service[0] == 'mapred.grid':
          mapred = True

    if not mapred:
      status = "mapred"

    if not hdfs and self.__cfg['gridservice-hdfs']['external'] == False:
      if status != True:
        status = "mapred and hdfs"
      else:
        status = "hdfs"

    return status

  def __get_service_data(self):
    serviceData = self.__svcrgyClient.getServiceInfo(
      self.__cfg['hod']['userid'], self.__nodePool.getNodePoolId())

    return serviceData

  def __check_job_status(self):
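    """Polls the nodepool until the submitted job leaves the queued ('Q')
       state, then checks job feasibility if configured. Returns -1 if the
       job can never run (user limits exceeded), True if the job reached a
       running state, False otherwise."""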
    failureCount = 0
    status = False
    state = 'Q'
    userLimitsFirstFlag = True

    while (state == 'Q') or (state == False):
      if hodInterrupt.isSet():
        raise HodInterruptException()

      jobInfo = self.__nodePool.getJobInfo()
      state = jobInfo['job_state']
      self.__log.debug('job state %s' % state)
      if state == False:
        failureCount += 1
        if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
          self.__log.debug('Number of retries reached max limit while querying job status')
          break
        time.sleep(self.__cfg['hod']['job-command-failure-interval'])
      elif state != 'Q':
        break
      else:
        self.__log.debug('querying for job status after job-status-query-interval')
        time.sleep(self.__cfg['hod']['job-status-query-interval'])

    if self.__cfg['hod'].has_key('job-feasibility-attr') and \
                      self.__cfg['hod']['job-feasibility-attr']:
      (status, msg) = self.__isJobFeasible()
      if status == "Never":
        self.__log.critical(TORQUE_USER_LIMITS_EXCEEDED_MSG + msg + \
              "This cluster cannot be allocated now.")
        return -1
      elif status == False:
        if userLimitsFirstFlag:
          self.__log.critical(TORQUE_USER_LIMITS_EXCEEDED_MSG + msg + \
              "This cluster allocation will succeed only after other " + \
              "clusters are deallocated.")
          userLimitsFirstFlag = False

    if state and state != 'C':
      status = True

    return status

  def __isJobFeasible(self):
    return self.__nodePool.isJobFeasible()

  def __get_ringmaster_client(self):
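    """Looks up the ringmaster's XML-RPC address in the service registry,
       retrying for up to 'allocate-wait-time' seconds while the job is
       still alive. Returns the address, or None if it never appears."""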
    ringmasterXRS = None

    ringList = self.__svcrgyClient.getServiceInfo(
      self.__cfg['ringmaster']['userid'], self.__nodePool.getServiceId(),
      'ringmaster', 'hod')

    if ringList and len(ringList):
      if isinstance(ringList, list):
        ringmasterXRS = ringList[0]['xrs']
    else:
      count = 0
      waitTime = self.__cfg['hod']['allocate-wait-time']

      while count < waitTime:
        if hodInterrupt.isSet():
          raise HodInterruptException()

        ringList = self.__svcrgyClient.getServiceInfo(
          self.__cfg['ringmaster']['userid'], self.__nodePool.getServiceId(),
          'ringmaster', 'hod')

        if ringList and len(ringList):
          if isinstance(ringList, list):
            ringmasterXRS = ringList[0]['xrs']

        if ringmasterXRS is not None:
          break
        else:
          time.sleep(1)
          count = count + 1
          # check to see if the job exited by any chance in that time:
          if (count % self.__cfg['hod']['job-status-query-interval'] == 0):
            if not self.__check_job_status():
              break
    return ringmasterXRS

  def __init_hadoop_service(self, serviceName, xmlrpcClient):
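    """Polls the ringmaster for the address of the named Hadoop service
       ('hdfs' or 'mapred') and registers the service's URLs with the
       service registry. Returns (status, serviceAddress, serviceInfo)."""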
    status = True
    serviceAddress = None
    serviceInfo = None

    for i in range(0, 250):
      try:
        if hodInterrupt.isSet():
          raise HodInterruptException()

        serviceAddress = xmlrpcClient.getServiceAddr(serviceName)
        if serviceAddress:
          if serviceAddress == 'not found':
            time.sleep(1)
            # check to see if the job exited by any chance in that time:
            if ((i+1) % self.__cfg['hod']['job-status-query-interval'] == 0):
              if not self.__check_job_status():
                break
          else:
            serviceInfo = xmlrpcClient.getURLs(serviceName)
            break
      except HodInterruptException, h:
        raise h
      except:
        self.__log.critical("'%s': ringmaster xmlrpc error." % serviceName)
        self.__log.debug(get_exception_string())
        status = False
        break

    if serviceAddress == 'not found' or not serviceAddress:
      self.__log.critical("Failed to retrieve '%s' service address." %
                          serviceName)
      status = False
    elif serviceAddress.startswith("Error: "):
      errs = serviceAddress[len("Error: "):]
      self.__log.critical("Cluster could not be allocated because of the following errors.\n%s" % \
                          errs)
      status = False
    else:
      try:
        self.__svcrgyClient.registerService(self.__cfg['hodring']['userid'],
                                            self.jobId, self.__hostname,
                                            serviceName, 'grid', serviceInfo)

      except HodInterruptException, h:
        raise h
      except:
        self.__log.critical("'%s': registry xmlrpc error." % serviceName)
        self.__log.debug(get_exception_string())
        status = False

    return status, serviceAddress, serviceInfo

  def __collect_jobtracker_ui(self, dir):
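    """Crawls the jobtracker web UI starting at jobtracker.jsp and saves
       the pages under 'dir', rewriting links so the snapshot can still be
       browsed after the cluster is deallocated."""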

    link = self.mapredInfo + "/jobtracker.jsp"
    parser = miniHTMLParser()
    parser.setBaseUrl(self.mapredInfo)
    node_cache = {}

    self.__log.debug("collect_jobtracker_ui seeded with " + link)

    def alarm_handler(number, stack):
      raise AlarmException("timeout")

    signal.signal(signal.SIGALRM, alarm_handler)

    input = None
    while link:
      self.__log.debug("link: %s" % link)
      # taskstats.jsp, taskdetails.jsp not included since too many to collect
      if re.search(
        r"jobfailures\.jsp|jobtracker\.jsp|jobdetails\.jsp|jobtasks\.jsp",
        link):

        for i in range(1, 5):
          if hodInterrupt.isSet():
            raise HodInterruptException()
          try:
            input = urllib.urlopen(link)
            break
          except:
            self.__log.debug(get_exception_string())
            time.sleep(1)

        if input:
          out = None

          self.__log.debug("collecting " + link + "...")
          filename = re.sub(self.mapredInfo, "", link)
          filename = dir + "/" + filename
          filename = re.sub("http://", "", filename)
          filename = re.sub(r"[\?\&=:]", "_", filename)
          filename = filename + ".html"

          try:
            tempdir, tail = os.path.split(filename)
            if not os.path.exists(tempdir):
              os.makedirs(tempdir)
          except:
            self.__log.debug(get_exception_string())

          out = open(filename, 'w')

          bufSz = 8192

          signal.alarm(10)

          try:
            self.__log.debug("Starting to grab: %s" % link)
            buf = input.read(bufSz)

            while len(buf) > 0:
              # Feed the file into the HTML parser
              parser.feed(buf)

              # Re-write the hrefs in the file
              p = re.compile(r"\?(.+?)=(.+?)")
              buf = p.sub(r"_\1_\2", buf)
              p = re.compile(r"&(.+?)=(.+?)")
              buf = p.sub(r"_\1_\2", buf)
              p = re.compile(r"http://(.+?):(\d+)?")
              buf = p.sub(r"\1_\2/", buf)
              buf = re.sub("href=\"/", "href=\"", buf)
              p = re.compile("href=\"(.+?)\"")
              buf = p.sub(r"href=\1.html", buf)

              out.write(buf)
              buf = input.read(bufSz)

            signal.alarm(0)
            input.close()
            if out:
              out.close()

            self.__log.debug("Finished grabbing: %s" % link)
          except AlarmException:
            if hodInterrupt.isSet():
              raise HodInterruptException()
            if out: out.close()
            if input: input.close()

            self.__log.debug("Failed to retrieve: %s" % link)
        else:
          self.__log.debug("Failed to retrieve: %s" % link)

      # Get the next link in level traversal order
      link = parser.getNextLink()

    parser.close()

  def check_cluster(self, clusterInfo):
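    """Checks whether the cluster's HDFS and Map/Reduce daemons respond.
       Returns 12 if they do, 15 if clusterInfo has no 'mapred' entry, and
       the failure code from get_cluster_status otherwise."""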
    status = 0

    if 'mapred' in clusterInfo:
      mapredAddress = clusterInfo['mapred'][7:]
      hdfsAddress = clusterInfo['hdfs'][7:]
      status = get_cluster_status(hdfsAddress, mapredAddress)
      if status == 0:
        status = 12
    else:
      status = 15

    return status

  def is_cluster_deallocated(self, jobId):
    """Returns True if the job that represents this cluster
       is in the Completed ('C') or Exiting ('E') state."""
    jobInfo = self.__nodePool.getJobInfo(jobId)
    state = None
    if jobInfo is not None and jobInfo.has_key('job_state'):
      state = jobInfo['job_state']
    return ((state == 'C') or (state == 'E'))

  def cleanup(self):
    if self.__nodePool: self.__nodePool.finalize()

  def get_job_id(self):
    return self.jobId

  def delete_job(self, jobId):
    '''Delete a job given its ID'''
    ret = 0
    if self.__nodePool:
      ret = self.__nodePool.deleteJob(jobId)
    else:
      raise Exception("Invalid state: Node pool is not initialized to delete the given job.")
    return ret

  def is_valid_account(self):
    """Verify if the account being used to submit the job is a valid account.
       This code looks for a file <install-dir>/bin/verify-account.
       If the file is present, it executes the file, passing as argument
       the account name. It returns the exit code and output from the
       script on non-zero exit code."""

    accountValidationScript = os.path.abspath('./verify-account')
    if not os.path.exists(accountValidationScript):
      return (0, None)

    account = self.__nodePool.getAccountString()
    exitCode = 0
    errMsg = None
    try:
      accountValidationCmd = simpleCommand('Account Validation Command',\
                                           '%s %s' % (accountValidationScript,
                                                      account))
      accountValidationCmd.start()
      accountValidationCmd.wait()
      accountValidationCmd.join()
      exitCode = accountValidationCmd.exit_code()
      self.__log.debug('account validation script ran with exit code %d' \
                        % exitCode)
      errMsg = None
      if exitCode != 0:
        errMsg = accountValidationCmd.output()
    except Exception, e:
      exitCode = 0
      self.__log.warn('Error executing account script: %s ' \
                      'Accounting is disabled.' \
                       % get_exception_error_string())
      self.__log.debug(get_exception_string())
    return (exitCode, errMsg)

  def allocate(self, clusterDir, min, max=None):
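    """Allocates a cluster of at least 'min' nodes: submits a node set to
       the nodepool, waits for the job to run, contacts the ringmaster,
       brings up the hdfs and mapred services, and generates the
       client-side hadoop-site.xml in clusterDir. Returns 0 on success,
       a non-zero HOD status code otherwise."""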
    status = 0
    failureCount = 0
    self.__svcrgyClient = self.__get_svcrgy_client()

    self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))

    if min < 3:
      self.__log.critical("Minimum nodes must be greater than 2.")
      status = 2
    else:
      nodeSet = self.__nodePool.newNodeSet(min)
      walltime = None
      if self.__cfg['hod'].has_key('walltime'):
        walltime = self.__cfg['hod']['walltime']
      self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
      # if the job submission returned an error other than 'no resources',
      # retry a couple of times
      while (self.jobId is False) and (exitCode != 188):
        if hodInterrupt.isSet():
          raise HodInterruptException()

        failureCount += 1
        if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
          self.__log.debug("failed submitting job more than the retries. exiting")
          break
        else:
          # wait a bit before retrying
          time.sleep(self.__cfg['hod']['job-command-failure-interval'])
          if hodInterrupt.isSet():
            raise HodInterruptException()
          self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)

      if self.jobId:
        jobStatus = None
        try:
          jobStatus = self.__check_job_status()
        except HodInterruptException, h:
          self.__log.info(HOD_INTERRUPTED_MESG)
          self.delete_job(self.jobId)
          self.__log.info("Cluster %s removed from queue." % self.jobId)
          raise h
        else:
          if jobStatus == -1:
            self.delete_job(self.jobId)
            status = 4
            return status

        if jobStatus:
          self.__log.info("Cluster Id %s" % self.jobId)
          try:
            self.ringmasterXRS = self.__get_ringmaster_client()

            self.__log.debug("Ringmaster at: %s" % self.ringmasterXRS)
            ringClient = None
            if self.ringmasterXRS:
              ringClient = hodXRClient(self.ringmasterXRS)

              hdfsStatus, hdfsAddr, self.hdfsInfo = \
                self.__init_hadoop_service('hdfs', ringClient)

              if hdfsStatus:
                self.__log.info("HDFS UI at http://%s" % self.hdfsInfo)

                mapredStatus, mapredAddr, self.mapredInfo = \
                  self.__init_hadoop_service('mapred', ringClient)

                if mapredStatus:
                  self.__log.info("Mapred UI at http://%s" % self.mapredInfo)

                  if self.__cfg['hod'].has_key('update-worker-info') \
                    and self.__cfg['hod']['update-worker-info']:
                    workerInfoMap = {}
                    workerInfoMap['HDFS UI'] = 'http://%s' % self.hdfsInfo
                    workerInfoMap['Mapred UI'] = 'http://%s' % self.mapredInfo
                    # Ringmaster URL sample format : http://hostname:port/
                    workerInfoMap['RM RPC Port'] = '%s' % self.ringmasterXRS.split(":")[2].strip("/")
                    if mapredAddr.find(':') != -1:
                      workerInfoMap['Mapred RPC Port'] = mapredAddr.split(':')[1]
                    ret = self.__nodePool.updateWorkerInfo(workerInfoMap, self.jobId)
                    if ret != 0:
                      self.__log.warn('Could not update HDFS and Mapred information. ' \
                                      'User Portal may not show relevant information. ' \
                                      'Error code=%s' % ret)

                  self.__cfg.replace_escape_seqs()

                  # Go generate the client-side hadoop-site.xml now,
                  # adding final-params as well, just so that conf on
                  # client-side and server-side are (almost) the same.
                  clientParams = None
                  serverParams = {}
                  finalServerParams = {}

                  # client-params
                  if self.__cfg['hod'].has_key('client-params'):
                    clientParams = self.__cfg['hod']['client-params']

                  # server-params
                  if self.__cfg['gridservice-mapred'].has_key('server-params'):
                    serverParams.update(\
                      self.__cfg['gridservice-mapred']['server-params'])
                  if self.__cfg['gridservice-hdfs'].has_key('server-params'):
                    # note that if there are params in both mapred and hdfs
                    # sections, the ones in hdfs overwrite the ones in mapred
                    serverParams.update(\
                      self.__cfg['gridservice-hdfs']['server-params'])

                  # final-server-params
                  if self.__cfg['gridservice-mapred'].has_key(\
                                                    'final-server-params'):
                    finalServerParams.update(\
                      self.__cfg['gridservice-mapred']['final-server-params'])
                  if self.__cfg['gridservice-hdfs'].has_key(\
                                                    'final-server-params'):
                    finalServerParams.update(\
                      self.__cfg['gridservice-hdfs']['final-server-params'])

                  clusterFactor = self.__cfg['hod']['cluster-factor']
                  tempDir = self.__cfg['hod']['temp-dir']
                  if not os.path.exists(tempDir):
                    os.makedirs(tempDir)
                  tempDir = os.path.join(tempDir, self.__cfg['hod']['userid']\
                                                  + "." + self.jobId)
                  mrSysDir = getMapredSystemDirectory(self.__cfg['hodring']['mapred-system-dir-root'],\
                                      self.__cfg['hod']['userid'], self.jobId)
                  self.__hadoopCfg.gen_site_conf(clusterDir, tempDir, min,\
                                      hdfsAddr, mrSysDir, mapredAddr, clientParams,\
                                      serverParams, finalServerParams,\
                                      clusterFactor)
                  self.__log.info("hadoop-site.xml at %s" % clusterDir)
                  # end of hadoop-site.xml generation
                else:
                  status = 8
              else:
                status = 7
            else:
              status = 6
            if status != 0:
              self.__log.debug("Cleaning up cluster id %s, as cluster could not be allocated." % self.jobId)
              if ringClient is None:
                self.delete_job(self.jobId)
              else:
                self.__log.debug("Calling rm.stop()")
                ringClient.stopRM()
                self.__log.debug("Returning from rm.stop()")
          except HodInterruptException, h:
            self.__log.info(HOD_INTERRUPTED_MESG)
            if self.ringmasterXRS:
              if ringClient is None:
                ringClient = hodXRClient(self.ringmasterXRS)
              self.__log.debug("Calling rm.stop()")
              ringClient.stopRM()
              self.__log.debug("Returning from rm.stop()")
              self.__log.info("Cluster shut down by informing ringmaster.")
            else:
              self.delete_job(self.jobId)
              self.__log.info("Cluster %s removed from queue directly." % self.jobId)
            raise h
        else:
          self.__log.critical("No cluster found, ringmaster failed to run.")
          status = 5

      elif self.jobId == False:
        if exitCode == 188:
          self.__log.critical("Request exceeded maximum resource allocation.")
        else:
          self.__log.critical("Job submission failed with exit code %s" % exitCode)
        status = 4
      else:
        self.__log.critical("Scheduler failure, allocation failed.\n\n")
        status = 4

    if status == 5 or status == 6:
      ringMasterErrors = self.__svcrgyClient.getRMError()
      if ringMasterErrors:
        self.__log.critical("Cluster could not be allocated because" \
                            " of the following errors on the " \
                            "ringmaster host %s.\n%s" % \
                            (ringMasterErrors[0], ringMasterErrors[1]))
        self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[2])
    return status

  def __isRingMasterAlive(self, rmAddr):
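    """Returns True if a TCP connection can be opened to the ringmaster
       at rmAddr (a 'host:port' string), False otherwise."""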
    ret = True
    rmSocket = tcpSocket(rmAddr)
    try:
      rmSocket.open()
      rmSocket.close()
    except tcpError:
      ret = False

    return ret

  def deallocate(self, clusterDir, clusterInfo):
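    """Deallocates the cluster: optionally snapshots the jobtracker UI,
       asks the ringmaster to stop (or finalizes the nodepool if the
       cluster is already dead), and removes HOD's temporary directory
       for the job."""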
    status = 0

    nodeSet = self.__nodePool.newNodeSet(clusterInfo['min'],
                                         id=clusterInfo['jobid'])
    self.mapredInfo = clusterInfo['mapred']
    self.hdfsInfo = clusterInfo['hdfs']

    try:
      if self.__cfg['hod'].has_key('hadoop-ui-log-dir'):
        clusterStatus = self.check_cluster(clusterInfo)
        if clusterStatus != 14 and clusterStatus != 10:
          # If JT is still alive
          self.__collect_jobtracker_ui(self.__cfg['hod']['hadoop-ui-log-dir'])
      else:
        self.__log.debug('hadoop-ui-log-dir not specified. Skipping Hadoop UI log collection.')
    except HodInterruptException, h:
      # got an interrupt. just pass and proceed to qdel
      pass
    except:
      self.__log.info("Exception in collecting Job tracker logs. Ignoring.")

    rmAddr = None
    if clusterInfo.has_key('ring'):
      # format is http://host:port/ We need host:port
      rmAddr = clusterInfo['ring'][7:]
      if rmAddr.endswith('/'):
        rmAddr = rmAddr[:-1]

    if (rmAddr is None) or (not self.__isRingMasterAlive(rmAddr)):
      # Cluster is already dead, don't try to contact ringmaster.
      self.__nodePool.finalize()
      status = 10  # As cluster is dead, we just set the status to 'cluster dead'.
    else:
      xrsAddr = clusterInfo['ring']
      rmClient = hodXRClient(xrsAddr)
      self.__log.debug('calling rm.stop')
      rmClient.stopRM()
      self.__log.debug('completed rm.stop')

    # cleanup hod temp dirs
    tempDir = os.path.join(self.__cfg['hod']['temp-dir'], \
                    self.__cfg['hod']['userid'] + "." + clusterInfo['jobid'])
    if os.path.exists(tempDir):
      shutil.rmtree(tempDir)

    return status

class hadoopScript:
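  """Runs a user-supplied script against an allocated cluster, with
     HADOOP_CONF_DIR pointing at the cluster's configuration directory."""
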
  def __init__(self, conf, execDir):
    self.__environ = os.environ.copy()
    self.__environ['HADOOP_CONF_DIR'] = conf
    self.__execDir = execDir

  def run(self, script):
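    """Executes the given script via simpleCommand in the exec directory,
       with the cluster environment, and returns its exit code."""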
    scriptThread = simpleCommand(script, script, self.__environ, 4, False,
                                 False, self.__execDir)
    scriptThread.start()
    scriptThread.wait()
    scriptThread.join()

    return scriptThread.exit_code()
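
# A minimal usage sketch (hypothetical paths, for illustration only):
#   script = hadoopScript('/home/user/cluster-conf', '/tmp/hod-exec')
#   exitCode = script.run('/home/user/my-hadoop-job.sh')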