source: proiecte/HadoopJUnit/hadoop-0.20.1/docs/api/org/apache/hadoop/examples/terasort/package-summary.html @ 120

Last change on this file since 120 was 120, checked in by (none), 14 years ago

Added the mail files for the Hadoop JUNit Project

  • Property svn:executable set to *
File size: 12.0 KB
Line 
1<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2<!--NewPage-->
3<HTML>
4<HEAD>
5<!-- Generated by javadoc (build 1.6.0_07) on Tue Sep 01 20:56:59 UTC 2009 -->
6<TITLE>
7org.apache.hadoop.examples.terasort (Hadoop 0.20.1 API)
8</TITLE>
9
10<META NAME="date" CONTENT="2009-09-01">
11
12<LINK REL ="stylesheet" TYPE="text/css" HREF="../../../../../stylesheet.css" TITLE="Style">
13
14<SCRIPT type="text/javascript">
15function windowTitle()
16{
17    if (location.href.indexOf('is-external=true') == -1) {
18        parent.document.title="org.apache.hadoop.examples.terasort (Hadoop 0.20.1 API)";
19    }
20}
21</SCRIPT>
22<NOSCRIPT>
23</NOSCRIPT>
24
25</HEAD>
26
27<BODY BGCOLOR="white" onload="windowTitle();">
28<HR>
29
30
31<!-- ========= START OF TOP NAVBAR ======= -->
32<A NAME="navbar_top"><!-- --></A>
33<A HREF="#skip-navbar_top" title="Skip navigation links"></A>
34<TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY="">
35<TR>
36<TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1">
37<A NAME="navbar_top_firstrow"><!-- --></A>
38<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY="">
39  <TR ALIGN="center" VALIGN="top">
40  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
41  <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT>&nbsp;</TD>
42  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <FONT CLASS="NavBarFont1">Class</FONT>&nbsp;</TD>
43  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
44  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
45  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
46  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
47  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A>&nbsp;</TD>
48  </TR>
49</TABLE>
50</TD>
51<TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM>
52</EM>
53</TD>
54</TR>
55
56<TR>
57<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
58&nbsp;<A HREF="../../../../../org/apache/hadoop/examples/dancing/package-summary.html"><B>PREV PACKAGE</B></A>&nbsp;
59&nbsp;<A HREF="../../../../../org/apache/hadoop/filecache/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD>
60<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
61  <A HREF="../../../../../index.html?org/apache/hadoop/examples/terasort/package-summary.html" target="_top"><B>FRAMES</B></A>  &nbsp;
62&nbsp;<A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A>  &nbsp;
63&nbsp;<SCRIPT type="text/javascript">
64  <!--
65  if(window==top) {
66    document.writeln('<A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A>');
67  }
68  //-->
69</SCRIPT>
70<NOSCRIPT>
71  <A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A>
72</NOSCRIPT>
73
74
75</FONT></TD>
76</TR>
77</TABLE>
78<A NAME="skip-navbar_top"></A>
79<!-- ========= END OF TOP NAVBAR ========= -->
80
81<HR>
82<H2>
83Package org.apache.hadoop.examples.terasort
84</H2>
85This package consists of 3 map/reduce applications for Hadoop to
86compete in the annual <a
87href="http://www.hpl.hp.com/hosted/sortbenchmark" target="_top">terabyte sort</a>
88competition.
89<P>
90<B>See:</B>
91<BR>
92&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<A HREF="#package_description"><B>Description</B></A>
93<P>
94
95<TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY="">
96<TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor">
97<TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2">
98<B>Class Summary</B></FONT></TH>
99</TR>
100<TR BGCOLOR="white" CLASS="TableRowColor">
101<TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraGen.html" title="class in org.apache.hadoop.examples.terasort">TeraGen</A></B></TD>
102<TD>Generate the official terasort input data set.</TD>
103</TR>
104<TR BGCOLOR="white" CLASS="TableRowColor">
105<TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraGen.SortGenMapper.html" title="class in org.apache.hadoop.examples.terasort">TeraGen.SortGenMapper</A></B></TD>
106<TD>The Mapper class that, given a row number, will generate the appropriate
107 output line.</TD>
108</TR>
109<TR BGCOLOR="white" CLASS="TableRowColor">
110<TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraInputFormat.html" title="class in org.apache.hadoop.examples.terasort">TeraInputFormat</A></B></TD>
111<TD>An input format that reads the first 10 characters of each line as the key
112 and the rest of the line as the value.</TD>
113</TR>
114<TR BGCOLOR="white" CLASS="TableRowColor">
115<TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraOutputFormat.html" title="class in org.apache.hadoop.examples.terasort">TeraOutputFormat</A></B></TD>
116<TD>A streamlined text output format that writes key, value, and "\r\n".</TD>
117</TR>
118<TR BGCOLOR="white" CLASS="TableRowColor">
119<TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraSort.html" title="class in org.apache.hadoop.examples.terasort">TeraSort</A></B></TD>
120<TD>Generates the sampled split points, launches the job, and waits for it to
121 finish.</TD>
122</TR>
123<TR BGCOLOR="white" CLASS="TableRowColor">
124<TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraValidate.html" title="class in org.apache.hadoop.examples.terasort">TeraValidate</A></B></TD>
125<TD>Generate 1 mapper per file that checks to make sure the keys
126 are sorted within each file.</TD>
127</TR>
128</TABLE>
129&nbsp;
130
131<P>
132<A NAME="package_description"><!-- --></A><H2>
133Package org.apache.hadoop.examples.terasort Description
134</H2>
135
136<P>
137This package consists of 3 map/reduce applications for Hadoop to
138compete in the annual <a
139href="http://www.hpl.hp.com/hosted/sortbenchmark" target="_top">terabyte sort</a>
140competition.
141
142<ul>
143<li><b>TeraGen</b> is a map/reduce program to generate the data.
144<li><b>TeraSort</b> samples the input data and uses map/reduce to
145    sort the data into a total order.
146<li><b>TeraValidate</b> is a map/reduce program that validates the
147    output is sorted.
148</ul>
149
150<p>
151
152<b>TeraGen</b> generates output data that is byte for byte
153equivalent to the C version including the newlines and specific
154keys. It divides the desired number of rows by the desired number of
155tasks and assigns ranges of rows to each map. The map jumps the random
156number generator to the correct value for the first row and generates
157the following rows.
158
159<p>
160
161<b>TeraSort</b> is a standard map/reduce sort, except for a custom
162partitioner that uses a sorted list of <i>N-1</i> sampled keys that define
163the key range for each reduce. In particular, all keys such that
164<i>sample[i-1] &lt;= key &lt; sample[i]</i> are sent to reduce
165<i>i</i>. This guarantees that the outputs of reduce <i>i</i> are all
166less than the output of reduce <i>i+1</i>. To speed up the
167partitioning, the partitioner builds a two level trie that quickly
168indexes into the list of sample keys based on the first two bytes of
169the key. TeraSort generates the sample keys by sampling the input
170before the job is submitted and writing the list of keys into HDFS.
171The input and output format, which are used by all 3 applications,
172read and write the text files in the right format. The output of the
173reduce has replication set to 1, instead of the default 3, because the
174contest does not require the output data be replicated on to multiple
175nodes. 
176
177<p>
178
179<b>TeraValidate</b> ensures that the output is globally sorted. It
180creates one map per file in the output directory and each map ensures that
181each key is greater than or equal to the previous one. The map also generates
182records with the first and last keys of the file and the reduce
183ensures that the first key of file <i>i</i> is greater than the last key of
184file <i>i-1</i>. Any problems are reported as output of the reduce with the
185keys that are out of order.
186
187<p>
188
189In May 2008, Owen O'Malley ran this code on a 910 node cluster and
190sorted the 10 billion records (1 TB) in 209 seconds (3.48 minutes) to
191win the annual general purpose (daytona)
192<a href="http://www.hpl.hp.com/hosted/sortbenchmark/">terabyte sort
193benchmark</a>.
194
195<p>
196
197The cluster statistics were:
198<ul>
199<li> 910 nodes
200<li> 4 dual core Xeons @ 2.0 GHz per node
201<li> 4 SATA disks per node
202<li> 8 GB RAM per node
203<li> 1 gigabit ethernet on each node
204<li> 40 nodes per rack
205<li> 8 gigabit ethernet uplinks from each rack to the core
206<li> Red Hat Enterprise Linux Server Release 5.1 (kernel 2.6.18)
207<li> Sun Java JDK 1.6.0_05-b13
208</ul>
209
210<p>
211
212The test was on Hadoop trunk (pre-0.18) patched with <a
213href="http://issues.apache.org/jira/browse/HADOOP-3443">HADOOP-3443</a>
214and <a
215href="http://issues.apache.org/jira/browse/HADOOP-3446">HADOOP-3446</a>,
216which were required to remove intermediate writes to disk.
217TeraGen used
2181800 tasks to generate a total of 10 billion rows in HDFS, with a
219block size of 1024 MB.
220TeraSort was configured with 1800 maps and 1800 reduces, and
221<i>io.sort.mb</i>,
222<i>io.sort.factor</i>, <i>fs.inmemory.size.mb</i>, and task heap size
223sufficient that transient data was never spilled to disk, other than at the
224end of the map. The sampler looked at 100,000 keys to determine the
225reduce boundaries, which led to imperfect balancing with reduce
226outputs ranging from 337 MB to 872 MB.
227<P>
228
229<P>
230<DL>
231</DL>
232<HR>
233
234
235<!-- ======= START OF BOTTOM NAVBAR ====== -->
236<A NAME="navbar_bottom"><!-- --></A>
237<A HREF="#skip-navbar_bottom" title="Skip navigation links"></A>
238<TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY="">
239<TR>
240<TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1">
241<A NAME="navbar_bottom_firstrow"><!-- --></A>
242<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY="">
243  <TR ALIGN="center" VALIGN="top">
244  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
245  <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT>&nbsp;</TD>
246  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <FONT CLASS="NavBarFont1">Class</FONT>&nbsp;</TD>
247  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
248  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
249  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
250  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
251  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A>&nbsp;</TD>
252  </TR>
253</TABLE>
254</TD>
255<TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM>
256</EM>
257</TD>
258</TR>
259
260<TR>
261<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
262&nbsp;<A HREF="../../../../../org/apache/hadoop/examples/dancing/package-summary.html"><B>PREV PACKAGE</B></A>&nbsp;
263&nbsp;<A HREF="../../../../../org/apache/hadoop/filecache/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD>
264<TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2">
265  <A HREF="../../../../../index.html?org/apache/hadoop/examples/terasort/package-summary.html" target="_top"><B>FRAMES</B></A>  &nbsp;
266&nbsp;<A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A>  &nbsp;
267&nbsp;<SCRIPT type="text/javascript">
268  <!--
269  if(window==top) {
270    document.writeln('<A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A>');
271  }
272  //-->
273</SCRIPT>
274<NOSCRIPT>
275  <A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A>
276</NOSCRIPT>
277
278
279</FONT></TD>
280</TR>
281</TABLE>
282<A NAME="skip-navbar_bottom"></A>
283<!-- ======== END OF BOTTOM NAVBAR ======= -->
284
285<HR>
286Copyright &copy; 2009 The Apache Software Foundation
287</BODY>
288</HTML>
Note: See TracBrowser for help on using the repository browser.