source: proiecte/HadoopJUnit/hadoop-0.20.1/docs/hod_admin_guide.html

Last change on this file was 120, checked in by (none), 14 years ago

Added the mail files for the Hadoop JUnit Project

  • Property svn:executable set to *
File size: 27.7 KB
Line 
1<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2<html>
3<head>
4<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
5<meta content="Apache Forrest" name="Generator">
6<meta name="Forrest-version" content="0.8">
7<meta name="Forrest-skin-name" content="pelt">
8<title> 
9      HOD Administrator Guide
10    </title>
11<link type="text/css" href="skin/basic.css" rel="stylesheet">
12<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
13<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
14<link type="text/css" href="skin/profile.css" rel="stylesheet">
15<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
16<link rel="shortcut icon" href="images/favicon.ico">
17</head>
18<body onload="init()">
19<script type="text/javascript">ndeSetTextSize();</script>
20<div id="top">
21<!--+
22    |breadtrail
23    +-->
24<div class="breadtrail">
25<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
26</div>
27<!--+
28    |header
29    +-->
30<div class="header">
31<!--+
32    |start group logo
33    +-->
34<div class="grouplogo">
35<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
36</div>
37<!--+
38    |end group logo
39    +-->
40<!--+
41    |start Project Logo
42    +-->
43<div class="projectlogo">
44<a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a>
45</div>
46<!--+
47    |end Project Logo
48    +-->
49<!--+
50    |start Search
51    +-->
52<div class="searchbox">
53<form action="http://www.google.com/search" method="get" class="roundtopsmall">
54<input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
55                    <input name="Search" value="Search" type="submit">
56</form>
57</div>
58<!--+
59    |end search
60    +-->
61<!--+
62    |start Tabs
63    +-->
64<ul id="tabs">
65<li>
66<a class="unselected" href="http://hadoop.apache.org/core/">Project</a>
67</li>
68<li>
69<a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a>
70</li>
71<li class="current">
72<a class="selected" href="index.html">Hadoop 0.20 Documentation</a>
73</li>
74</ul>
75<!--+
76    |end Tabs
77    +-->
78</div>
79</div>
80<div id="main">
81<div id="publishedStrip">
82<!--+
83    |start Subtabs
84    +-->
85<div id="level2tabs"></div>
86<!--+
87    |end Endtabs
88    +-->
89<script type="text/javascript"><!--
90document.write("Last Published: " + document.lastModified);
91//  --></script>
92</div>
93<!--+
94    |breadtrail
95    +-->
96<div class="breadtrail">
97
98             &nbsp;
99           </div>
100<!--+
101    |start Menu, mainarea
102    +-->
103<!--+
104    |start Menu
105    +-->
106<div id="menu">
107<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Getting Started</div>
108<div id="menu_1.1" class="menuitemgroup">
109<div class="menuitem">
110<a href="index.html">Overview</a>
111</div>
112<div class="menuitem">
113<a href="quickstart.html">Quick Start</a>
114</div>
115<div class="menuitem">
116<a href="cluster_setup.html">Cluster Setup</a>
117</div>
118<div class="menuitem">
119<a href="mapred_tutorial.html">Map/Reduce Tutorial</a>
120</div>
121</div>
122<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Programming Guides</div>
123<div id="menu_1.2" class="menuitemgroup">
124<div class="menuitem">
125<a href="commands_manual.html">Commands</a>
126</div>
127<div class="menuitem">
128<a href="distcp.html">DistCp</a>
129</div>
130<div class="menuitem">
131<a href="native_libraries.html">Native Libraries</a>
132</div>
133<div class="menuitem">
134<a href="streaming.html">Streaming</a>
135</div>
136<div class="menuitem">
137<a href="fair_scheduler.html">Fair Scheduler</a>
138</div>
139<div class="menuitem">
140<a href="capacity_scheduler.html">Capacity Scheduler</a>
141</div>
142<div class="menuitem">
143<a href="service_level_auth.html">Service Level Authorization</a>
144</div>
145<div class="menuitem">
146<a href="vaidya.html">Vaidya</a>
147</div>
148<div class="menuitem">
149<a href="hadoop_archives.html">Archives</a>
150</div>
151</div>
152<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">HDFS</div>
153<div id="menu_1.3" class="menuitemgroup">
154<div class="menuitem">
155<a href="hdfs_user_guide.html">User Guide</a>
156</div>
157<div class="menuitem">
158<a href="hdfs_design.html">Architecture</a>
159</div>
160<div class="menuitem">
161<a href="hdfs_shell.html">File System Shell Guide</a>
162</div>
163<div class="menuitem">
164<a href="hdfs_permissions_guide.html">Permissions Guide</a>
165</div>
166<div class="menuitem">
167<a href="hdfs_quota_admin_guide.html">Quotas Guide</a>
168</div>
169<div class="menuitem">
170<a href="SLG_user_guide.html">Synthetic Load Generator Guide</a>
171</div>
172<div class="menuitem">
173<a href="libhdfs.html">C API libhdfs</a>
174</div>
175</div>
176<div onclick="SwitchMenu('menu_selected_1.4', 'skin/')" id="menu_selected_1.4Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">HOD</div>
177<div id="menu_selected_1.4" class="selectedmenuitemgroup" style="display: block;">
178<div class="menuitem">
179<a href="hod_user_guide.html">User Guide</a>
180</div>
181<div class="menupage">
182<div class="menupagetitle">Admin Guide</div>
183</div>
184<div class="menuitem">
185<a href="hod_config_guide.html">Config Guide</a>
186</div>
187</div>
188<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div>
189<div id="menu_1.5" class="menuitemgroup">
190<div class="menuitem">
191<a href="api/index.html">API Docs</a>
192</div>
193<div class="menuitem">
194<a href="jdiff/changes.html">API Changes</a>
195</div>
196<div class="menuitem">
197<a href="http://wiki.apache.org/hadoop/">Wiki</a>
198</div>
199<div class="menuitem">
200<a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>
201</div>
202<div class="menuitem">
203<a href="releasenotes.html">Release Notes</a>
204</div>
205<div class="menuitem">
206<a href="changes.html">Change Log</a>
207</div>
208</div>
209<div id="credit"></div>
210<div id="roundbottom">
211<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
212<!--+
213  |alternative credits
214  +-->
215<div id="credit2"></div>
216</div>
217<!--+
218    |end Menu
219    +-->
220<!--+
221    |start content
222    +-->
223<div id="content">
224<div title="Portable Document Format" class="pdflink">
225<a class="dida" href="hod_admin_guide.pdf"><img alt="PDF icon" src="skin/images/pdfdoc.gif" class="skin"><br>
226        PDF</a>
227</div>
228<h1> 
229      HOD Administrator Guide
230    </h1>
231<div id="minitoc-area">
232<ul class="minitoc">
233<li>
234<a href="#Overview">Overview</a>
235</li>
236<li>
237<a href="#Pre-requisites">Pre-requisites</a>
238</li>
239<li>
240<a href="#Resource+Manager">Resource Manager</a>
241</li>
242<li>
243<a href="#Installing+HOD">Installing HOD</a>
244</li>
245<li>
246<a href="#Configuring+HOD">Configuring HOD</a>
247<ul class="minitoc">
248<li>
249<a href="#Minimal+Configuration">Minimal Configuration</a>
250</li>
251<li>
252<a href="#Advanced+Configuration">Advanced Configuration</a>
253</li>
254</ul>
255</li>
256<li>
257<a href="#Running+HOD">Running HOD</a>
258</li>
259<li>
260<a href="#Supporting+Tools+and+Utilities">Supporting Tools and Utilities</a>
261<ul class="minitoc">
262<li>
263<a href="#logcondense.py+-+Manage+Log+Files">logcondense.py - Manage Log Files</a>
264<ul class="minitoc">
265<li>
266<a href="#Running+logcondense.py">Running logcondense.py</a>
267</li>
268<li>
269<a href="#Command+Line+Options+for+logcondense.py">Command Line Options for logcondense.py</a>
270</li>
271</ul>
272</li>
273<li>
274<a href="#checklimits.sh+-+Monitor+Resource+Limits">checklimits.sh - Monitor Resource Limits</a>
275<ul class="minitoc">
276<li>
277<a href="#Running+checklimits.sh">Running checklimits.sh</a>
278</li>
279</ul>
280</li>
281<li>
282<a href="#verify-account+-+Script+to+verify+an+account+under+which+%0A+++++++++++++jobs+are+submitted">verify-account - Script to verify an account under which
283             jobs are submitted</a>
284<ul class="minitoc">
285<li>
286<a href="#Integrating+the+verify-account+script+with+HOD">Integrating the verify-account script with HOD</a>
287</li>
288</ul>
289</li>
290</ul>
291</li>
292</ul>
293</div>
294
295<a name="N1000C"></a><a name="Overview"></a>
296<h2 class="h3">Overview</h2>
297<div class="section">
298<p>Hadoop On Demand (HOD) is a system for provisioning and
299managing independent Hadoop Map/Reduce and Hadoop Distributed File System (HDFS)
300instances on a shared cluster
301of nodes. HOD is a tool that makes it easy for administrators and users to
302quickly setup and use Hadoop. HOD is also a very useful tool for Hadoop developers
303and testers who need to share a physical cluster for testing their own Hadoop
304versions.
305</p>
306<p>HOD relies on a resource manager (RM) for allocation of nodes that it can use for
307running Hadoop instances. At present it runs with the <a href="http://www.clusterresources.com/pages/products/torque-resource-manager.php">Torque
308resource manager</a>.
309</p>
310<p>
311The basic system architecture of HOD includes these components:</p>
312<ul>
313 
314<li>A Resource manager (possibly together with a scheduler)</li>
315 
316<li>Various HOD components</li>
317 
318<li>Hadoop Map/Reduce and HDFS daemons</li>
319
320</ul>
321<p>
322HOD provisions and maintains Hadoop Map/Reduce and, optionally, HDFS instances
323through interaction with the above components on a given cluster of nodes. A cluster of
324nodes can be thought of as comprising two sets of nodes:</p>
325<ul>
326 
327<li>Submit nodes: Users use the HOD client on these nodes to allocate clusters, and then
328use the Hadoop client to submit Hadoop jobs. </li>
329 
330<li>Compute nodes: Using the resource manager, HOD components are run on these nodes to
331provision the Hadoop daemons. After that Hadoop jobs run on them.</li>
332
333</ul>
334<p>
335Here is a brief description of the sequence of operations in allocating a cluster and
336running jobs on them.
337</p>
338<ul>
339 
340<li>The user uses the HOD client on the Submit node to allocate a desired number of
341cluster nodes and to provision Hadoop on them.</li>
342 
343<li>The HOD client uses a resource manager interface (qsub, in Torque) to submit a HOD
344process, called the RingMaster, as a Resource Manager job, to request the user's desired number
345of nodes. This job is submitted to the central server of the resource manager (pbs_server, in Torque).</li>
346 
347<li>On the compute nodes, the resource manager slave daemons (pbs_moms in Torque) accept
348and run jobs that they are assigned by the central server (pbs_server in Torque). The RingMaster
349process is started on one of the compute nodes (mother superior, in Torque).</li>
350 
351<li>The RingMaster then uses another resource manager interface (pbsdsh, in Torque) to run
352the second HOD component, HodRing, as distributed tasks on each of the compute
353nodes allocated.</li>
354 
355<li>The HodRings, after initializing, communicate with the RingMaster to get Hadoop commands,
356and run them accordingly. Once the Hadoop commands are started, they register with the RingMaster,
357giving information about the daemons.</li>
358 
359<li>All the configuration files needed for Hadoop instances are generated by HOD itself,
360some obtained from options given by user in its own configuration file.</li>
361 
362<li>The HOD client keeps communicating with the RingMaster to find out the location of the
363JobTracker and HDFS daemons.</li>
364
365</ul>
366<p>This guide shows you how to get started using HOD, reviews various HOD features and command line options, and provides detailed troubleshooting help.</p>
367</div>
368
369
370<a name="N10056"></a><a name="Pre-requisites"></a>
371<h2 class="h3">Pre-requisites</h2>
372<div class="section">
373<p>To use HOD, your system should include the following hardware and software
374components.</p>
375<p>Operating System: HOD is currently tested on RHEL4.<br>
376Nodes : HOD requires a minimum of three nodes configured through a resource manager.<br>
377</p>
378<p> Software </p>
379<p>The following components must be installed on ALL nodes before using HOD:</p>
380<ul>
381 
382<li>
383<a href="http://www.clusterresources.com/pages/products/torque-resource-manager.php">Torque: Resource manager</a>
384</li>
385 
386<li>
387<a href="http://www.python.org">Python</a> : HOD requires version 2.5.1 of Python.</li>
388
389</ul>
390<p>The following components are optional and can be installed to obtain better
391functionality from HOD:</p>
392<ul>
393 
394<li>
395<a href="http://twistedmatrix.com/trac/">Twisted Python</a>: This can be
396  used for improving the scalability of HOD. If this module is detected to be
397  installed, HOD uses it, else it falls back to default modules.</li>
398 
399<li>
400<a href="http://hadoop.apache.org/core/">Hadoop</a>: HOD can automatically
401 distribute Hadoop to all nodes in the cluster. However, it can also use a
402 pre-installed version of Hadoop, if it is available on all nodes in the cluster.
403  HOD currently supports Hadoop 0.15 and above.</li>
404
405</ul>
406<p>NOTE: HOD configuration requires the location of installs of these
407components to be the same on all nodes in the cluster. It will also
408make the configuration simpler to have the same location on the submit
409nodes.
410</p>
411</div>
412
413
414<a name="N1008F"></a><a name="Resource+Manager"></a>
415<h2 class="h3">Resource Manager</h2>
416<div class="section">
417<p>  Currently HOD works with the Torque resource manager, which it uses for its node
418  allocation and job submission. Torque is an open source resource manager from
419  <a href="http://www.clusterresources.com">Cluster Resources</a>, a community effort
420  based on the PBS project. It provides control over batch jobs and distributed compute nodes. Torque is
421  freely available for download from <a href="http://www.clusterresources.com/downloads/torque/">here</a>.
422  </p>
423<p>  All documentation related to torque can be seen under
424  the section TORQUE Resource Manager <a href="http://www.clusterresources.com/pages/resources/documentation.php">here</a>. You can
425  get wiki documentation from <a href="http://www.clusterresources.com/wiki/doku.php?id=torque:torque_wiki">here</a>.
426  Users may wish to subscribe to TORQUE&rsquo;s mailing list or view the archive for questions,
427  comments <a href="http://www.clusterresources.com/pages/resources/mailing-lists.php">here</a>.
428</p>
429<p>To use HOD with Torque:</p>
430<ul>
431 
432<li>Install Torque components: pbs_server on one node (head node), pbs_mom on all
433  compute nodes, and PBS client tools on all compute nodes and submit
434  nodes. Perform at least a basic configuration so that the Torque system is up and
435  running, that is, pbs_server knows which machines to talk to. Look <a href="http://www.clusterresources.com/wiki/doku.php?id=torque:1.2_basic_configuration">here</a>
436  for basic configuration.
437
438  For advanced configuration, see <a href="http://www.clusterresources.com/wiki/doku.php?id=torque:1.3_advanced_configuration">here</a>
439</li>
440 
441<li>Create a queue for submitting jobs on the pbs_server. The name of the queue is the
442  same as the HOD configuration parameter, resource-manager.queue. The HOD client uses this queue to
443  submit the RingMaster process as a Torque job.</li>
444 
445<li>Specify a cluster name as a property for all nodes in the cluster.
446  This can be done by using the qmgr command. For example:
447  <span class="codefrag">qmgr -c "set node node properties=cluster-name"</span>. The name of the cluster is the same as
448  the HOD configuration parameter, hod.cluster. </li>
449 
450<li>Make sure that jobs can be submitted to the nodes. This can be done by
451  using the qsub command. For example:
452  <span class="codefrag">echo "sleep 30" | qsub -l nodes=3</span>
453</li>
454
455</ul>
456</div>
457
458
459<a name="N100CE"></a><a name="Installing+HOD"></a>
460<h2 class="h3">Installing HOD</h2>
461<div class="section">
462<p>Once the resource manager is set up, you can obtain and
463install HOD.</p>
464<ul>
465 
466<li>If you are getting HOD from the Hadoop tarball, it is available under the
467  'contrib' section of Hadoop, under the root  directory 'hod'.</li>
468 
469<li>If you are building from source, you can run ant tar from the Hadoop root
470  directory to generate the Hadoop tarball, and then get HOD from there,
471  as described above.</li>
472 
473<li>Distribute the files under this directory to all the nodes in the
474  cluster. Note that the location where the files are copied should be
475  the same on all the nodes.</li>
476 
477<li>Note that compiling hadoop would build HOD with appropriate permissions
478  set on all the required script files in HOD.</li>
479
480</ul>
481</div>
482
483
484<a name="N100E7"></a><a name="Configuring+HOD"></a>
485<h2 class="h3">Configuring HOD</h2>
486<div class="section">
487<p>You can configure HOD once it is installed. The minimal configuration needed
488to run HOD is described below. More advanced configuration options are discussed
489in the HOD Configuration Guide.</p>
490<a name="N100F0"></a><a name="Minimal+Configuration"></a>
491<h3 class="h4">Minimal Configuration</h3>
492<p>To get started using HOD, the following minimal configuration is
493  required:</p>
494<ul>
495 
496<li>On the node from where you want to run HOD, edit the file hodrc
497  located in the &lt;install dir&gt;/conf directory. This file
498  contains the minimal set of values required to run hod.</li>
499 
500<li>
501
502<p>Specify values suitable to your environment for the following
503  variables defined in the configuration file. Note that some of these
504  variables are defined at more than one place in the file.</p>
505
506 
507<ul>
508   
509<li>${JAVA_HOME}: Location of Java for Hadoop. Hadoop supports Sun JDK
510    1.6.x and above.</li>
511   
512<li>${CLUSTER_NAME}: Name of the cluster which is specified in the
513    'node property' as mentioned in resource manager configuration.</li>
514   
515<li>${HADOOP_HOME}: Location of Hadoop installation on the compute and
516    submit nodes.</li>
517   
518<li>${RM_QUEUE}: Queue configured for submitting jobs in the resource
519    manager configuration.</li>
520   
521<li>${RM_HOME}: Location of the resource manager installation on the
522    compute and submit nodes.</li>
523   
524</ul>
525
526</li>
527
528
529<li>
530
531<p>The following environment variables may need to be set depending on
532  your environment. These variables must be defined where you run the
533  HOD client and must also be specified in the HOD configuration file as the
534  value of the key resource_manager.env-vars. Multiple variables can be
535  specified as a comma separated list of key=value pairs.</p>
536
537 
538<ul>
539   
540<li>HOD_PYTHON_HOME: If you install python to a non-default location
541    of the compute nodes, or submit nodes, then this variable must be
542    defined to point to the python executable in the non-standard
543    location.</li>
544   
545</ul>
546
547</li>
548
549</ul>
550<a name="N10124"></a><a name="Advanced+Configuration"></a>
551<h3 class="h4">Advanced Configuration</h3>
552<p> You can review and modify other configuration options to suit
553 your specific needs. Refer to the <a href="hod_config_guide.html">HOD Configuration
554 Guide</a> for more information.</p>
555</div>
556
557 
558<a name="N10133"></a><a name="Running+HOD"></a>
559<h2 class="h3">Running HOD</h2>
560<div class="section">
561<p>You can run HOD once it is configured. Refer to the <a href="hod_user_guide.html">HOD User Guide</a> for more information.</p>
562</div>
563
564 
565<a name="N10141"></a><a name="Supporting+Tools+and+Utilities"></a>
566<h2 class="h3">Supporting Tools and Utilities</h2>
567<div class="section">
568<p>This section describes supporting tools and utilities that can be used to
569    manage HOD deployments.</p>
570<a name="N1014A"></a><a name="logcondense.py+-+Manage+Log+Files"></a>
571<h3 class="h4">logcondense.py - Manage Log Files</h3>
572<p>As mentioned in the
573         <a href="hod_user_guide.html#Collecting+and+Viewing+Hadoop+Logs">HOD User Guide</a>,
574         HOD can be configured to upload
575         Hadoop logs to a statically configured HDFS. Over time, the number of logs uploaded
576         to HDFS could increase. logcondense.py is a tool that helps
577         administrators to remove log files uploaded to HDFS. </p>
578<a name="N10157"></a><a name="Running+logcondense.py"></a>
579<h4>Running logcondense.py</h4>
580<p>logcondense.py is available under hod_install_location/support folder. You can either
581        run it using python, for example, <em>python logcondense.py</em>, or give execute permissions
582        to the file, and directly run it as <em>logcondense.py</em>. logcondense.py needs to be
583        run by a user who has sufficient permissions to remove files from locations where log
584        files are uploaded in the HDFS, if permissions are enabled. For example as mentioned in the
585        <a href="hod_config_guide.html#3.7+hodring+options">HOD Configuration Guide</a>, the logs could
586        be configured to come under the user's home directory in HDFS. In that case, the user
587        running logcondense.py should have super user privileges to remove the files from under
588        all user home directories.</p>
589<a name="N1016B"></a><a name="Command+Line+Options+for+logcondense.py"></a>
590<h4>Command Line Options for logcondense.py</h4>
591<p>The following command line options are supported for logcondense.py.</p>
592<table class="ForrestTable" cellspacing="1" cellpadding="4">
593           
594<tr>
595             
596<td colspan="1" rowspan="1">Short Option</td>
597              <td colspan="1" rowspan="1">Long option</td>
598              <td colspan="1" rowspan="1">Meaning</td>
599              <td colspan="1" rowspan="1">Example</td>
600           
601</tr>
602           
603<tr>
604             
605<td colspan="1" rowspan="1">-p</td>
606              <td colspan="1" rowspan="1">--package</td>
607              <td colspan="1" rowspan="1">Complete path to the hadoop script. The version of hadoop must be the same as the
608                  one running HDFS.</td>
609              <td colspan="1" rowspan="1">/usr/bin/hadoop</td>
610           
611</tr>
612           
613<tr>
614             
615<td colspan="1" rowspan="1">-d</td>
616              <td colspan="1" rowspan="1">--days</td>
617              <td colspan="1" rowspan="1">Delete log files older than the specified number of days</td>
618              <td colspan="1" rowspan="1">7</td>
619           
620</tr>
621           
622<tr>
623             
624<td colspan="1" rowspan="1">-c</td>
625              <td colspan="1" rowspan="1">--config</td>
626              <td colspan="1" rowspan="1">Path to the Hadoop configuration directory, under which hadoop-site.xml resides.
627              The hadoop-site.xml must point to the HDFS NameNode from where logs are to be removed.</td>
628              <td colspan="1" rowspan="1">/home/foo/hadoop/conf</td>
629           
630</tr>
631           
632<tr>
633             
634<td colspan="1" rowspan="1">-l</td>
635              <td colspan="1" rowspan="1">--logs</td>
636              <td colspan="1" rowspan="1">An HDFS path, this must be the same HDFS path as specified for the log-destination-uri,
637              as mentioned in the  <a href="hod_config_guide.html#3.7+hodring+options">HOD Configuration Guide</a>,
638              without the hdfs:// URI string</td>
639              <td colspan="1" rowspan="1">/user</td>
640           
641</tr>
642           
643<tr>
644             
645<td colspan="1" rowspan="1">-n</td>
646              <td colspan="1" rowspan="1">--dynamicdfs</td>
647              <td colspan="1" rowspan="1">If true, this will indicate that the logcondense.py script should delete HDFS logs
648              in addition to Map/Reduce logs. Otherwise, it only deletes Map/Reduce logs, which is also the
649              default if this option is not specified. This option is useful if
650              dynamic HDFS installations
651              are being provisioned by HOD, and the static HDFS installation is being used only to collect
652              logs - a scenario that may be common in test clusters.</td>
653              <td colspan="1" rowspan="1">false</td>
654           
655</tr>
656         
657</table>
658<p>So, for example, to delete all log files older than 7 days using a hadoop-site.xml stored in
659        ~/hadoop-conf, using the hadoop installation under ~/hadoop-0.17.0, you could say:</p>
660<p>
661<em>python logcondense.py -p ~/hadoop-0.17.0/bin/hadoop -d 7 -c ~/hadoop-conf -l /user</em>
662</p>
663<a name="N1020E"></a><a name="checklimits.sh+-+Monitor+Resource+Limits"></a>
664<h3 class="h4">checklimits.sh - Monitor Resource Limits</h3>
665<p>checklimits.sh is a HOD tool specific to the Torque/Maui environment
666      (<a href="http://www.clusterresources.com/pages/products/maui-cluster-scheduler.php">Maui Cluster Scheduler</a> is an open source job
667      scheduler for clusters and supercomputers, from clusterresources). The
668      checklimits.sh script
669      updates the torque comment field when newly submitted job(s) violate or
670      exceed
671      user limits set up in Maui scheduler. It uses qstat, does one pass
672      over the torque job-list to determine queued or unfinished jobs, runs Maui
673      tool checkjob on each job to see if user limits are violated and then
674      runs torque's qalter utility to update job attribute 'comment'. Currently
675      it updates the comment as <em>User-limits exceeded. Requested:([0-9]*)
676      Used:([0-9]*) MaxLimit:([0-9]*)</em> for those jobs that violate limits.
677      This comment field is then used by HOD to behave accordingly depending on
678      the type of violation.</p>
679<a name="N1021E"></a><a name="Running+checklimits.sh"></a>
680<h4>Running checklimits.sh</h4>
681<p>checklimits.sh is available under the hod_install_location/support
682        folder. This shell script can be run directly as <em>sh
683        checklimits.sh </em>or as <em>./checklimits.sh</em> after enabling
684        execute permissions. Torque and Maui binaries should be available
685        on the machine where the tool is run and should be in the path
686        of the shell script process. To update the
687        comment field of jobs from different users, this tool must be run with
688        torque administrative privileges. This tool must be run repeatedly
689        after specific intervals of time to frequently update jobs violating
690        constraints, for example via cron. Please note that the resource manager
691        and scheduler commands used in this script can be expensive and so
692        it is better not to run this inside a tight loop without sleeping.</p>
693<a name="N1022F"></a><a name="verify-account+-+Script+to+verify+an+account+under+which+%0A+++++++++++++jobs+are+submitted"></a>
694<h3 class="h4">verify-account - Script to verify an account under which
695             jobs are submitted</h3>
696<p>Production systems use accounting packages to charge users for using
697      shared compute resources. HOD supports a parameter
698      <em>resource_manager.pbs-account</em> to allow users to identify the
699      account under which they would like to submit jobs. It may be necessary
700      to verify that this account is a valid one configured in an accounting
701      system. The <em>hod-install-dir/bin/verify-account</em> script
702      provides a mechanism to plug-in a custom script that can do this
703      verification.</p>
704<a name="N1023E"></a><a name="Integrating+the+verify-account+script+with+HOD"></a>
705<h4>Integrating the verify-account script with HOD</h4>
706<p>HOD runs the <em>verify-account</em> script passing in the
707        <em>resource_manager.pbs-account</em> value as argument to the script,
708        before allocating a cluster. Sites can write a script that verifies this
709        account against their accounting systems. Returning a non-zero exit
710        code from this script will cause HOD to fail allocation. Also, in
711        case of an error, HOD will print the output of script to the user.
712        Any descriptive error message can be passed to the user from the
713        script in this manner.</p>
714<p>The default script that comes with the HOD installation does not
715        do any validation, and returns a zero exit code.</p>
716<p>If the verify-account script is not found, then HOD will assume
717        that verification is disabled, and continue allocation as is.</p>
718</div>
719
720
721</div>
722<!--+
723    |end content
724    +-->
725<div class="clearboth">&nbsp;</div>
726</div>
727<div id="footer">
728<!--+
729    |start bottomstrip
730    +-->
731<div class="lastmodified">
732<script type="text/javascript"><!--
733document.write("Last Published: " + document.lastModified);
734//  --></script>
735</div>
736<div class="copyright">
737        Copyright &copy;
738         2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
739</div>
740<!--+
741    |end bottomstrip
742    +-->
743</div>
744</body>
745</html>
Note: See TracBrowser for help on using the repository browser.