1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> |
---|
2 | <!--NewPage--> |
---|
3 | <HTML> |
---|
4 | <HEAD> |
---|
5 | <!-- Generated by javadoc (build 1.6.0_07) on Tue Sep 01 20:56:59 UTC 2009 --> |
---|
6 | <TITLE> |
---|
7 | org.apache.hadoop.examples.terasort (Hadoop 0.20.1 API) |
---|
8 | </TITLE> |
---|
9 | |
---|
10 | <META NAME="date" CONTENT="2009-09-01"> |
---|
11 | |
---|
12 | <LINK REL ="stylesheet" TYPE="text/css" HREF="../../../../../stylesheet.css" TITLE="Style"> |
---|
13 | |
---|
14 | <SCRIPT type="text/javascript"> |
---|
15 | function windowTitle() /* Invoked from the BODY onload handler; sets the parent frame's document title to this page's title string. */ |
---|
16 | { |
---|
17 | if (location.href.indexOf('is-external=true') == -1) { /* leave the parent title untouched when the URL contains is-external=true */ |
---|
18 | parent.document.title="org.apache.hadoop.examples.terasort (Hadoop 0.20.1 API)"; |
---|
19 | } |
---|
20 | } |
---|
21 | </SCRIPT> |
---|
22 | <NOSCRIPT> |
---|
23 | </NOSCRIPT> |
---|
24 | |
---|
25 | </HEAD> |
---|
26 | |
---|
27 | <BODY BGCOLOR="white" onload="windowTitle();"> |
---|
28 | <HR> |
---|
29 | |
---|
30 | |
---|
31 | <!-- ========= START OF TOP NAVBAR ======= --> |
---|
32 | <A NAME="navbar_top"><!-- --></A> |
---|
33 | <A HREF="#skip-navbar_top" title="Skip navigation links"></A> |
---|
34 | <TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY=""> |
---|
35 | <TR> |
---|
36 | <TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> |
---|
37 | <A NAME="navbar_top_firstrow"><!-- --></A> |
---|
38 | <TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY=""> |
---|
39 | <TR ALIGN="center" VALIGN="top"> |
---|
40 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD> |
---|
41 | <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT> </TD> |
---|
42 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <FONT CLASS="NavBarFont1">Class</FONT> </TD> |
---|
43 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD> |
---|
44 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD> |
---|
45 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD> |
---|
46 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD> |
---|
47 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A> </TD> |
---|
48 | </TR> |
---|
49 | </TABLE> |
---|
50 | </TD> |
---|
51 | <TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM> |
---|
52 | </EM> |
---|
53 | </TD> |
---|
54 | </TR> |
---|
55 | |
---|
56 | <TR> |
---|
57 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
58 | <A HREF="../../../../../org/apache/hadoop/examples/dancing/package-summary.html"><B>PREV PACKAGE</B></A> |
---|
59 | <A HREF="../../../../../org/apache/hadoop/filecache/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD> |
---|
60 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
61 | <A HREF="../../../../../index.html?org/apache/hadoop/examples/terasort/package-summary.html" target="_top"><B>FRAMES</B></A> |
---|
62 | <A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A> |
---|
63 | <SCRIPT type="text/javascript"> |
---|
64 | <!-- |
---|
65 | if(window==top) { |
---|
66 | document.writeln('<A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A>'); |
---|
67 | } |
---|
68 | //--> |
---|
69 | </SCRIPT> |
---|
70 | <NOSCRIPT> |
---|
71 | <A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A> |
---|
72 | </NOSCRIPT> |
---|
73 | |
---|
74 | |
---|
75 | </FONT></TD> |
---|
76 | </TR> |
---|
77 | </TABLE> |
---|
78 | <A NAME="skip-navbar_top"></A> |
---|
79 | <!-- ========= END OF TOP NAVBAR ========= --> |
---|
80 | |
---|
81 | <HR> |
---|
82 | <H2> |
---|
83 | Package org.apache.hadoop.examples.terasort |
---|
84 | </H2> |
---|
85 | This package consists of 3 map/reduce applications for Hadoop to |
---|
86 | compete in the annual <a |
---|
87 | href="http://www.hpl.hp.com/hosted/sortbenchmark" target="_top">terabyte sort</a> |
---|
88 | competition. |
---|
89 | <P> |
---|
90 | <B>See:</B> |
---|
91 | <BR> |
---|
92 | <A HREF="#package_description"><B>Description</B></A> |
---|
93 | <P> |
---|
94 | |
---|
95 | <TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY=""> |
---|
96 | <TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor"> |
---|
97 | <TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2"> |
---|
98 | <B>Class Summary</B></FONT></TH> |
---|
99 | </TR> |
---|
100 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
101 | <TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraGen.html" title="class in org.apache.hadoop.examples.terasort">TeraGen</A></B></TD> |
---|
102 | <TD>Generate the official terasort input data set.</TD> |
---|
103 | </TR> |
---|
104 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
105 | <TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraGen.SortGenMapper.html" title="class in org.apache.hadoop.examples.terasort">TeraGen.SortGenMapper</A></B></TD> |
---|
106 | <TD>The Mapper class that, given a row number, will generate the appropriate |
---|
107 | output line.</TD> |
---|
108 | </TR> |
---|
109 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
110 | <TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraInputFormat.html" title="class in org.apache.hadoop.examples.terasort">TeraInputFormat</A></B></TD> |
---|
111 | <TD>An input format that reads the first 10 characters of each line as the key |
---|
112 | and the rest of the line as the value.</TD> |
---|
113 | </TR> |
---|
114 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
115 | <TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraOutputFormat.html" title="class in org.apache.hadoop.examples.terasort">TeraOutputFormat</A></B></TD> |
---|
116 | <TD>A streamlined text output format that writes key, value, and "\r\n".</TD> |
---|
117 | </TR> |
---|
118 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
119 | <TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraSort.html" title="class in org.apache.hadoop.examples.terasort">TeraSort</A></B></TD> |
---|
120 | <TD>Generates the sampled split points, launches the job, and waits for it to |
---|
121 | finish.</TD> |
---|
122 | </TR> |
---|
123 | <TR BGCOLOR="white" CLASS="TableRowColor"> |
---|
124 | <TD WIDTH="15%"><B><A HREF="../../../../../org/apache/hadoop/examples/terasort/TeraValidate.html" title="class in org.apache.hadoop.examples.terasort">TeraValidate</A></B></TD> |
---|
125 | <TD>Generate 1 mapper per file that checks to make sure the keys |
---|
126 | are sorted within each file.</TD> |
---|
127 | </TR> |
---|
128 | </TABLE> |
---|
129 | |
---|
130 | |
---|
131 | <P> |
---|
132 | <A NAME="package_description"><!-- --></A><H2> |
---|
133 | Package org.apache.hadoop.examples.terasort Description |
---|
134 | </H2> |
---|
135 | |
---|
136 | <P> |
---|
137 | This package consists of 3 map/reduce applications for Hadoop to |
---|
138 | compete in the annual <a |
---|
139 | href="http://www.hpl.hp.com/hosted/sortbenchmark" target="_top">terabyte sort</a> |
---|
140 | competition. |
---|
141 | |
---|
142 | <ul> |
---|
143 | <li><b>TeraGen</b> is a map/reduce program to generate the data. |
---|
144 | <li><b>TeraSort</b> samples the input data and uses map/reduce to |
---|
145 | sort the data into a total order. |
---|
146 | <li><b>TeraValidate</b> is a map/reduce program that validates the |
---|
147 | output is sorted. |
---|
148 | </ul> |
---|
149 | |
---|
150 | <p> |
---|
151 | |
---|
152 | <b>TeraGen</b> generates output data that is byte for byte |
---|
153 | equivalent to the C version including the newlines and specific |
---|
154 | keys. It divides the desired number of rows by the desired number of |
---|
155 | tasks and assigns ranges of rows to each map. The map jumps the random |
---|
156 | number generator to the correct value for the first row and generates |
---|
157 | the following rows. |
---|
158 | |
---|
159 | <p> |
---|
160 | |
---|
161 | <b>TeraSort</b> is a standard map/reduce sort, except for a custom |
---|
162 | partitioner that uses a sorted list of <i>N-1</i> sampled keys that define |
---|
163 | the key range for each reduce. In particular, all keys such that |
---|
164 | <i>sample[i-1] &lt;= key &lt; sample[i]</i> are sent to reduce |
---|
165 | <i>i</i>. This guarantees that the outputs of reduce <i>i</i> are all |
---|
166 | less than the outputs of reduce <i>i+1</i>. To speed up the |
---|
167 | partitioning, the partitioner builds a two level trie that quickly |
---|
168 | indexes into the list of sample keys based on the first two bytes of |
---|
169 | the key. TeraSort generates the sample keys by sampling the input |
---|
170 | before the job is submitted and writing the list of keys into HDFS. |
---|
171 | The input and output formats, which are used by all 3 applications, |
---|
172 | read and write the text files in the right format. The output of the |
---|
173 | reduce has replication set to 1, instead of the default 3, because the |
---|
174 | contest does not require the output data be replicated on to multiple |
---|
175 | nodes. |
---|
176 | |
---|
177 | <p> |
---|
178 | |
---|
179 | <b>TeraValidate</b> ensures that the output is globally sorted. It |
---|
180 | creates one map per file in the output directory and each map ensures that |
---|
181 | each key is greater than or equal to the previous one. The map also generates |
---|
182 | records with the first and last keys of the file and the reduce |
---|
183 | ensures that the first key of file <i>i</i> is greater than the last key of |
---|
184 | file <i>i-1</i>. Any problems are reported as output of the reduce with the |
---|
185 | keys that are out of order. |
---|
186 | |
---|
187 | <p> |
---|
188 | |
---|
189 | In May 2008, Owen O'Malley ran this code on a 910 node cluster and |
---|
190 | sorted the 10 billion records (1 TB) in 209 seconds (3.48 minutes) to |
---|
191 | win the annual general purpose (daytona) |
---|
192 | <a href="http://www.hpl.hp.com/hosted/sortbenchmark/">terabyte sort |
---|
193 | benchmark</a>. |
---|
194 | |
---|
195 | <p> |
---|
196 | |
---|
197 | The cluster statistics were: |
---|
198 | <ul> |
---|
199 | <li> 910 nodes |
---|
200 | <li> 4 dual core Xeons @ 2.0 GHz per node |
---|
201 | <li> 4 SATA disks per node |
---|
202 | <li> 8 GB RAM per node |
---|
203 | <li> 1 gigabit ethernet on each node |
---|
204 | <li> 40 nodes per rack |
---|
205 | <li> 8 gigabit ethernet uplinks from each rack to the core |
---|
206 | <li> Red Hat Enterprise Linux Server Release 5.1 (kernel 2.6.18) |
---|
207 | <li> Sun Java JDK 1.6.0_05-b13 |
---|
208 | </ul> |
---|
209 | |
---|
210 | <p> |
---|
211 | |
---|
212 | The test was on Hadoop trunk (pre-0.18) patched with <a |
---|
213 | href="http://issues.apache.org/jira/browse/HADOOP-3443">HADOOP-3443</a> |
---|
214 | and <a |
---|
215 | href="http://issues.apache.org/jira/browse/HADOOP-3446">HADOOP-3446</a>, |
---|
216 | which were required to remove intermediate writes to disk. |
---|
217 | TeraGen used |
---|
218 | 1800 tasks to generate a total of 10 billion rows in HDFS, with a |
---|
219 | block size of 1024 MB. |
---|
220 | TeraSort was configured with 1800 maps and 1800 reduces, and |
---|
221 | <i>io.sort.mb</i>, |
---|
222 | <i>io.sort.factor</i>, <i>fs.inmemory.size.mb</i>, and task heap size |
---|
223 | sufficient that transient data was never spilled to disk, other than at the |
---|
224 | end of the map. The sampler looked at 100,000 keys to determine the |
---|
225 | reduce boundaries, which led to imperfect balancing with reduce |
---|
226 | outputs ranging from 337 MB to 872 MB. |
---|
227 | <P> |
---|
228 | |
---|
229 | <P> |
---|
230 | <DL> |
---|
231 | </DL> |
---|
232 | <HR> |
---|
233 | |
---|
234 | |
---|
235 | <!-- ======= START OF BOTTOM NAVBAR ====== --> |
---|
236 | <A NAME="navbar_bottom"><!-- --></A> |
---|
237 | <A HREF="#skip-navbar_bottom" title="Skip navigation links"></A> |
---|
238 | <TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY=""> |
---|
239 | <TR> |
---|
240 | <TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> |
---|
241 | <A NAME="navbar_bottom_firstrow"><!-- --></A> |
---|
242 | <TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY=""> |
---|
243 | <TR ALIGN="center" VALIGN="top"> |
---|
244 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD> |
---|
245 | <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT> </TD> |
---|
246 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <FONT CLASS="NavBarFont1">Class</FONT> </TD> |
---|
247 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD> |
---|
248 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD> |
---|
249 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD> |
---|
250 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD> |
---|
251 | <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A> </TD> |
---|
252 | </TR> |
---|
253 | </TABLE> |
---|
254 | </TD> |
---|
255 | <TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM> |
---|
256 | </EM> |
---|
257 | </TD> |
---|
258 | </TR> |
---|
259 | |
---|
260 | <TR> |
---|
261 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
262 | <A HREF="../../../../../org/apache/hadoop/examples/dancing/package-summary.html"><B>PREV PACKAGE</B></A> |
---|
263 | <A HREF="../../../../../org/apache/hadoop/filecache/package-summary.html"><B>NEXT PACKAGE</B></A></FONT></TD> |
---|
264 | <TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> |
---|
265 | <A HREF="../../../../../index.html?org/apache/hadoop/examples/terasort/package-summary.html" target="_top"><B>FRAMES</B></A> |
---|
266 | <A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A> |
---|
267 | <SCRIPT type="text/javascript"> |
---|
268 | <!-- |
---|
269 | if(window==top) { |
---|
270 | document.writeln('<A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A>'); |
---|
271 | } |
---|
272 | //--> |
---|
273 | </SCRIPT> |
---|
274 | <NOSCRIPT> |
---|
275 | <A HREF="../../../../../allclasses-noframe.html"><B>All Classes</B></A> |
---|
276 | </NOSCRIPT> |
---|
277 | |
---|
278 | |
---|
279 | </FONT></TD> |
---|
280 | </TR> |
---|
281 | </TABLE> |
---|
282 | <A NAME="skip-navbar_bottom"></A> |
---|
283 | <!-- ======== END OF BOTTOM NAVBAR ======= --> |
---|
284 | |
---|
285 | <HR> |
---|
286 | Copyright © 2009 The Apache Software Foundation |
---|
287 | </BODY> |
---|
288 | </HTML> |
---|