source: proiecte/HadoopJUnit/hadoop-0.20.1/docs/distcp.html @ 120

Last change on this file since 120 was 120, checked in by (none), 14 years ago

Added the mail files for the Hadoop JUNit Project

  • Property svn:executable set to *
File size: 24.0 KB
Line 
1<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2<html>
3<head>
4<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
5<meta content="Apache Forrest" name="Generator">
6<meta name="Forrest-version" content="0.8">
7<meta name="Forrest-skin-name" content="pelt">
8<title>DistCp Guide</title>
9<link type="text/css" href="skin/basic.css" rel="stylesheet">
10<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
11<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
12<link type="text/css" href="skin/profile.css" rel="stylesheet">
13<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
14<link rel="shortcut icon" href="images/favicon.ico">
15</head>
16<body onload="init()">
17<script type="text/javascript">ndeSetTextSize();</script>
18<div id="top">
19<!--+
20    |breadtrail
21    +-->
22<div class="breadtrail">
23<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
24</div>
25<!--+
26    |header
27    +-->
28<div class="header">
29<!--+
30    |start group logo
31    +-->
32<div class="grouplogo">
33<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
34</div>
35<!--+
36    |end group logo
37    +-->
38<!--+
39    |start Project Logo
40    +-->
41<div class="projectlogo">
42<a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a>
43</div>
44<!--+
45    |end Project Logo
46    +-->
47<!--+
48    |start Search
49    +-->
50<div class="searchbox">
51<form action="http://www.google.com/search" method="get" class="roundtopsmall">
52<input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
53                    <input name="Search" value="Search" type="submit">
54</form>
55</div>
56<!--+
57    |end search
58    +-->
59<!--+
60    |start Tabs
61    +-->
62<ul id="tabs">
63<li>
64<a class="unselected" href="http://hadoop.apache.org/core/">Project</a>
65</li>
66<li>
67<a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a>
68</li>
69<li class="current">
70<a class="selected" href="index.html">Hadoop 0.20 Documentation</a>
71</li>
72</ul>
73<!--+
74    |end Tabs
75    +-->
76</div>
77</div>
78<div id="main">
79<div id="publishedStrip">
80<!--+
81    |start Subtabs
82    +-->
83<div id="level2tabs"></div>
84<!--+
85    |end Endtabs
86    +-->
87<script type="text/javascript"><!--
88document.write("Last Published: " + document.lastModified);
89//  --></script>
90</div>
91<!--+
92    |breadtrail
93    +-->
94<div class="breadtrail">
95
96             &nbsp;
97           </div>
98<!--+
99    |start Menu, mainarea
100    +-->
101<!--+
102    |start Menu
103    +-->
104<div id="menu">
105<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Getting Started</div>
106<div id="menu_1.1" class="menuitemgroup">
107<div class="menuitem">
108<a href="index.html">Overview</a>
109</div>
110<div class="menuitem">
111<a href="quickstart.html">Quick Start</a>
112</div>
113<div class="menuitem">
114<a href="cluster_setup.html">Cluster Setup</a>
115</div>
116<div class="menuitem">
117<a href="mapred_tutorial.html">Map/Reduce Tutorial</a>
118</div>
119</div>
120<div onclick="SwitchMenu('menu_selected_1.2', 'skin/')" id="menu_selected_1.2Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Programming Guides</div>
121<div id="menu_selected_1.2" class="selectedmenuitemgroup" style="display: block;">
122<div class="menuitem">
123<a href="commands_manual.html">Commands</a>
124</div>
125<div class="menupage">
126<div class="menupagetitle">DistCp</div>
127</div>
128<div class="menuitem">
129<a href="native_libraries.html">Native Libraries</a>
130</div>
131<div class="menuitem">
132<a href="streaming.html">Streaming</a>
133</div>
134<div class="menuitem">
135<a href="fair_scheduler.html">Fair Scheduler</a>
136</div>
137<div class="menuitem">
138<a href="capacity_scheduler.html">Capacity Scheduler</a>
139</div>
140<div class="menuitem">
141<a href="service_level_auth.html">Service Level Authorization</a>
142</div>
143<div class="menuitem">
144<a href="vaidya.html">Vaidya</a>
145</div>
146<div class="menuitem">
147<a href="hadoop_archives.html">Archives</a>
148</div>
149</div>
150<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">HDFS</div>
151<div id="menu_1.3" class="menuitemgroup">
152<div class="menuitem">
153<a href="hdfs_user_guide.html">User Guide</a>
154</div>
155<div class="menuitem">
156<a href="hdfs_design.html">Architecture</a>
157</div>
158<div class="menuitem">
159<a href="hdfs_shell.html">File System Shell Guide</a>
160</div>
161<div class="menuitem">
162<a href="hdfs_permissions_guide.html">Permissions Guide</a>
163</div>
164<div class="menuitem">
165<a href="hdfs_quota_admin_guide.html">Quotas Guide</a>
166</div>
167<div class="menuitem">
168<a href="SLG_user_guide.html">Synthetic Load Generator Guide</a>
169</div>
170<div class="menuitem">
171<a href="libhdfs.html">C API libhdfs</a>
172</div>
173</div>
174<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">HOD</div>
175<div id="menu_1.4" class="menuitemgroup">
176<div class="menuitem">
177<a href="hod_user_guide.html">User Guide</a>
178</div>
179<div class="menuitem">
180<a href="hod_admin_guide.html">Admin Guide</a>
181</div>
182<div class="menuitem">
183<a href="hod_config_guide.html">Config Guide</a>
184</div>
185</div>
186<div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div>
187<div id="menu_1.5" class="menuitemgroup">
188<div class="menuitem">
189<a href="api/index.html">API Docs</a>
190</div>
191<div class="menuitem">
192<a href="jdiff/changes.html">API Changes</a>
193</div>
194<div class="menuitem">
195<a href="http://wiki.apache.org/hadoop/">Wiki</a>
196</div>
197<div class="menuitem">
198<a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>
199</div>
200<div class="menuitem">
201<a href="releasenotes.html">Release Notes</a>
202</div>
203<div class="menuitem">
204<a href="changes.html">Change Log</a>
205</div>
206</div>
207<div id="credit"></div>
208<div id="roundbottom">
209<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
210<!--+
211  |alternative credits
212  +-->
213<div id="credit2"></div>
214</div>
215<!--+
216    |end Menu
217    +-->
218<!--+
219    |start content
220    +-->
221<div id="content">
222<div title="Portable Document Format" class="pdflink">
223<a class="dida" href="distcp.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
224        PDF</a>
225</div>
226<h1>DistCp Guide</h1>
227<div id="minitoc-area">
228<ul class="minitoc">
229<li>
230<a href="#Overview">Overview</a>
231</li>
232<li>
233<a href="#Usage">Usage</a>
234<ul class="minitoc">
235<li>
236<a href="#Basic">Basic</a>
237</li>
238<li>
239<a href="#options">Options</a>
240<ul class="minitoc">
241<li>
242<a href="#Option+Index">Option Index</a>
243</li>
244<li>
245<a href="#Symbolic-Representations">Symbolic Representations</a>
246</li>
247<li>
248<a href="#uo">Update and Overwrite</a>
249</li>
250</ul>
251</li>
252</ul>
253</li>
254<li>
255<a href="#etc">Appendix</a>
256<ul class="minitoc">
257<li>
258<a href="#Map+sizing">Map sizing</a>
259</li>
260<li>
261<a href="#cpver">Copying between versions of HDFS</a>
262</li>
263<li>
264<a href="#Map%2FReduce+and+other+side-effects">Map/Reduce and other side-effects</a>
265</li>
266</ul>
267</li>
268</ul>
269</div>
270
271   
272<a name="N1000D"></a><a name="Overview"></a>
273<h2 class="h3">Overview</h2>
274<div class="section">
275<p>DistCp (distributed copy) is a tool used for large inter/intra-cluster
276      copying. It uses Map/Reduce to effect its distribution, error
277      handling and recovery, and reporting. It expands a list of files and
278      directories into input to map tasks, each of which will copy a partition
279      of the files specified in the source list. Its Map/Reduce pedigree has
280      endowed it with some quirks in both its semantics and execution. The
281      purpose of this document is to offer guidance for common tasks and to
282      elucidate its model.</p>
283</div>
284
285   
286<a name="N10017"></a><a name="Usage"></a>
287<h2 class="h3">Usage</h2>
288<div class="section">
289<a name="N1001D"></a><a name="Basic"></a>
290<h3 class="h4">Basic</h3>
291<p>The most common invocation of DistCp is an inter-cluster copy:</p>
292<p>
293<span class="codefrag">bash$ hadoop distcp hdfs://nn1:8020/foo/bar \</span>
294<br>
295           
296<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
297                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
298                 hdfs://nn2:8020/bar/foo</span>
299</p>
300<p>This will expand the namespace under <span class="codefrag">/foo/bar</span> on nn1
301        into a temporary file, partition its contents among a set of map
302        tasks, and start a copy on each TaskTracker from nn1 to nn2. Note
303        that DistCp expects absolute paths.</p>
304<p>One can also specify multiple source directories on the command
305        line:</p>
306<p>
307<span class="codefrag">bash$ hadoop distcp hdfs://nn1:8020/foo/a \</span>
308<br>
309           
310<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
311                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
312                 hdfs://nn1:8020/foo/b \</span>
313<br>
314           
315<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
316                 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
317                 hdfs://nn2:8020/bar/foo</span>
318</p>
319<p>Or, equivalently, from a file using the <span class="codefrag">-f</span> option:<br>
320       
321<span class="codefrag">bash$ hadoop distcp -f hdfs://nn1:8020/srclist \</span>
322<br>
323       
324<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
325              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
326              &nbsp;hdfs://nn2:8020/bar/foo</span>
327<br>
328</p>
329<p>Where <span class="codefrag">srclist</span> contains<br>
330       
331<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</span>
332<br>
333       
334<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</span>
335</p>
336<p>When copying from multiple sources, DistCp will abort the copy with
337        an error message if two sources collide, but collisions at the
338        destination are resolved per the <a href="#options">options</a>
339        specified. By default, files already existing at the destination are
340        skipped (i.e. not replaced by the source file). A count of skipped
341        files is reported at the end of each job, but it may be inaccurate if a
342        copier failed for some subset of its files, but succeeded on a later
343        attempt (see <a href="#etc">Appendix</a>).</p>
344<p>It is important that each TaskTracker can reach and communicate with
345        both the source and destination file systems. For HDFS, both the source
346        and destination must be running the same version of the protocol or use
347        a backwards-compatible protocol (see <a href="#cpver">Copying Between
348        Versions</a>).</p>
349<p>After a copy, it is recommended that one generates and cross-checks
350        a listing of the source and destination to verify that the copy was
351        truly successful. Since DistCp employs both Map/Reduce and the
352        FileSystem API, issues in or between any of the three could adversely
353        and silently affect the copy. Some have had success running with
354        <span class="codefrag">-update</span> enabled to perform a second pass, but users should
355        be acquainted with its semantics before attempting this.</p>
356<p>It's also worth noting that if another client is still writing to a
357        source file, the copy will likely fail. Attempting to overwrite a file
358        being written at the destination should also fail on HDFS. If a source
359        file is (re)moved before it is copied, the copy will fail with a
360        FileNotFoundException.</p>
361<a name="N1007E"></a><a name="options"></a>
362<h3 class="h4">Options</h3>
363<a name="N10084"></a><a name="Option+Index"></a>
364<h4>Option Index</h4>
365<table class="ForrestTable" cellspacing="1" cellpadding="4">
366         
367<tr>
368<th colspan="1" rowspan="1"> Flag </th><th colspan="1" rowspan="1"> Description </th><th colspan="1" rowspan="1"> Notes </th>
369</tr>
370
371         
372<tr>
373<td colspan="1" rowspan="1"><span class="codefrag">-p[rbugp]</span></td>
374              <td colspan="1" rowspan="1">Preserve<br>
375                  &nbsp;&nbsp;r: replication number<br>
376                  &nbsp;&nbsp;b: block size<br>
377                  &nbsp;&nbsp;u: user<br>
378                  &nbsp;&nbsp;g: group<br>
379                  &nbsp;&nbsp;p: permission<br>
380</td>
381              <td colspan="1" rowspan="1">Modification times are not preserved. Also, when
382              <span class="codefrag">-update</span> is specified, status updates will
383              <strong>not</strong> be synchronized unless the file sizes
384              also differ (i.e. unless the file is re-created).
385              </td>
386</tr>
387         
388<tr>
389<td colspan="1" rowspan="1"><span class="codefrag">-i</span></td>
390              <td colspan="1" rowspan="1">Ignore failures</td>
391              <td colspan="1" rowspan="1">As explained in the <a href="#etc">Appendix</a>, this option
392              will keep more accurate statistics about the copy than the
393              default case. It also preserves logs from failed copies, which
394              can be valuable for debugging. Finally, a failing map will not
395              cause the job to fail before all splits are attempted.
396              </td>
397</tr>
398         
399<tr>
400<td colspan="1" rowspan="1"><span class="codefrag">-log &lt;logdir&gt;</span></td>
401              <td colspan="1" rowspan="1">Write logs to &lt;logdir&gt;</td>
402              <td colspan="1" rowspan="1">DistCp keeps logs of each file it attempts to copy as map
403              output. If a map fails, the log output will not be retained if
404              it is re-executed.
405              </td>
406</tr>
407         
408<tr>
409<td colspan="1" rowspan="1"><span class="codefrag">-m &lt;num_maps&gt;</span></td>
410              <td colspan="1" rowspan="1">Maximum number of simultaneous copies</td>
411              <td colspan="1" rowspan="1">Specify the number of maps to copy data. Note that more maps
412              may not necessarily improve throughput.
413              </td>
414</tr>
415         
416<tr>
417<td colspan="1" rowspan="1"><span class="codefrag">-overwrite</span></td>
418              <td colspan="1" rowspan="1">Overwrite destination</td>
419              <td colspan="1" rowspan="1">If a map fails and <span class="codefrag">-i</span> is not specified, all the
420              files in the split, not only those that failed, will be recopied.
421              As discussed in the <a href="#uo">following</a>, it also changes
422              the semantics for generating destination paths, so users should
423              use this carefully.
424              </td>
425</tr>
426         
427<tr>
428<td colspan="1" rowspan="1"><span class="codefrag">-update</span></td>
429              <td colspan="1" rowspan="1">Overwrite if src size different from dst size</td>
430              <td colspan="1" rowspan="1">As noted in the preceding, this is not a "sync"
431              operation. The only criterion examined is the source and
432              destination file sizes; if they differ, the source file
433              replaces the destination file. As discussed in the
434              <a href="#uo">following</a>, it also changes the semantics for
435              generating destination paths, so users should use this carefully.
436              </td>
437</tr>
438         
439<tr>
440<td colspan="1" rowspan="1"><span class="codefrag">-f &lt;urilist_uri&gt;</span></td>
441              <td colspan="1" rowspan="1">Use list at &lt;urilist_uri&gt; as src list</td>
442              <td colspan="1" rowspan="1">This is equivalent to listing each source on the command
443              line. The <span class="codefrag">urilist_uri</span> list should be a fully
444              qualified URI.
445              </td>
446</tr>
447         
448<tr>
449<td colspan="1" rowspan="1"><span class="codefrag">-filelimit &lt;n&gt;</span></td>
450              <td colspan="1" rowspan="1">Limit the total number of files to be &lt;= n</td>
451              <td colspan="1" rowspan="1">See also <a href="#Symbolic-Representations">Symbolic
452                  Representations</a>.
453              </td>
454</tr>
455         
456<tr>
457<td colspan="1" rowspan="1"><span class="codefrag">-sizelimit &lt;n&gt;</span></td>
458              <td colspan="1" rowspan="1">Limit the total size to be &lt;= n bytes</td>
459              <td colspan="1" rowspan="1">See also <a href="#Symbolic-Representations">Symbolic
460                  Representations</a>.
461              </td>
462</tr>
463         
464<tr>
465<td colspan="1" rowspan="1"><span class="codefrag">-delete</span></td>
466              <td colspan="1" rowspan="1">Delete the files existing in the dst but not in src</td>
467              <td colspan="1" rowspan="1">The deletion is done by FS Shell, so the trash will be used
468                  if it is enabled.
469              </td>
470</tr>
471
472       
473</table>
474<a name="N10171"></a><a name="Symbolic-Representations"></a>
475<h4>Symbolic Representations</h4>
476<p>
477        The parameter &lt;n&gt; in <span class="codefrag">-filelimit</span>
478        and <span class="codefrag">-sizelimit</span> can be specified with symbolic
479        representation.  For example,
480        </p>
481<ul>
482         
483<li>1230k = 1230 * 1024 = 1259520</li>
484         
485<li>891g = 891 * 1024^3 = 956703965184</li>
486       
487</ul>
488<a name="N1018A"></a><a name="uo"></a>
489<h4>Update and Overwrite</h4>
490<p>It's worth giving some examples of <span class="codefrag">-update</span> and
491        <span class="codefrag">-overwrite</span>. Consider a copy from <span class="codefrag">/foo/a</span> and
492        <span class="codefrag">/foo/b</span> to <span class="codefrag">/bar/foo</span>, where the sources contain
493        the following:</p>
494<p>
495<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</span>
496<br>
497       
498<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/aa</span>
499<br>
500       
501<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/ab</span>
502<br>
503       
504<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</span>
505<br>
506       
507<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ba</span>
508<br>
509       
510<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ab</span>
511</p>
512<p>If either <span class="codefrag">-update</span> or <span class="codefrag">-overwrite</span> is set,
513        then both sources will map an entry to <span class="codefrag">/bar/foo/ab</span> at the
514        destination. For both options, the contents of each source directory
515        are compared with the <strong>contents</strong> of the destination
516        directory. Rather than permit this conflict, DistCp will abort.</p>
517<p>In the default case, both <span class="codefrag">/bar/foo/a</span> and
518        <span class="codefrag">/bar/foo/b</span> will be created and neither will collide.</p>
519<p>Now consider a legal copy using <span class="codefrag">-update</span>:<br>
520       
521<span class="codefrag">distcp -update hdfs://nn1:8020/foo/a \</span>
522<br>
523       
524<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
525              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
526              hdfs://nn1:8020/foo/b \</span>
527<br>
528       
529<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
530              &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
531              hdfs://nn2:8020/bar</span>
532</p>
533<p>With sources/sizes:</p>
534<p>
535<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a</span>
536<br>
537       
538<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/aa 32</span>
539<br>
540       
541<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/a/ab 32</span>
542<br>
543       
544<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b</span>
545<br>
546       
547<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/ba 64</span>
548<br>
549       
550<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn1:8020/foo/b/bb 32</span>
551</p>
552<p>And destination/sizes:</p>
553<p>
554<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar</span>
555<br>
556       
557<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/aa 32</span>
558<br>
559       
560<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ba 32</span>
561<br>
562       
563<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/bb 64</span>
564</p>
565<p>Will effect:</p>
566<p>
567<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar</span>
568<br>
569       
570<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/aa 32</span>
571<br>
572       
573<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ab 32</span>
574<br>
575       
576<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/ba 64</span>
577<br>
578       
579<span class="codefrag">&nbsp;&nbsp;&nbsp;&nbsp;hdfs://nn2:8020/bar/bb 32</span>
580</p>
581<p>Only <span class="codefrag">aa</span> is not overwritten on nn2. If
582        <span class="codefrag">-overwrite</span> were specified, all elements would be
583        overwritten.</p>
584</div> <!-- Usage -->
585
586   
587<a name="N1023B"></a><a name="etc"></a>
588<h2 class="h3">Appendix</h2>
589<div class="section">
590<a name="N10241"></a><a name="Map+sizing"></a>
591<h3 class="h4">Map sizing</h3>
592<p>DistCp makes a faint attempt to size each map comparably so that
593          each copies roughly the same number of bytes. Note that files are the
594          finest level of granularity, so increasing the number of simultaneous
595          copiers (i.e. maps) may not always increase the number of
596          simultaneous copies nor the overall throughput.</p>
597<p>If <span class="codefrag">-m</span> is not specified, DistCp will attempt to
598          schedule work for <span class="codefrag">min (total_bytes / bytes.per.map, 20 *
599          num_task_trackers)</span> where <span class="codefrag">bytes.per.map</span> defaults
600          to 256MB.</p>
601<p>Tuning the number of maps to the size of the source and
602          destination clusters, the size of the copy, and the available
603          bandwidth is recommended for long-running and regularly run jobs.</p>
604<a name="N1025A"></a><a name="cpver"></a>
605<h3 class="h4">Copying between versions of HDFS</h3>
606<p>For copying between two different versions of Hadoop, one will
607        usually use HftpFileSystem. This is a read-only FileSystem, so DistCp
608        must be run on the destination cluster (more specifically, on
609        TaskTrackers that can write to the destination cluster). Each source is
610        specified as <span class="codefrag">hftp://&lt;dfs.http.address&gt;/&lt;path&gt;</span>
611        (the default <span class="codefrag">dfs.http.address</span> is
612        &lt;namenode&gt;:50070).</p>
613<a name="N1026A"></a><a name="Map%2FReduce+and+other+side-effects"></a>
614<h3 class="h4">Map/Reduce and other side-effects</h3>
615<p>As has been mentioned in the preceding, should a map fail to copy
616        one of its inputs, there will be several side-effects.</p>
617<ul>
618
619         
620<li>Unless <span class="codefrag">-i</span> is specified, the logs generated by that
621          task attempt will be replaced by the previous attempt.</li>
622
623         
624<li>Unless <span class="codefrag">-overwrite</span> is specified, files successfully
625          copied by a previous map on a re-execution will be marked as
626          "skipped".</li>
627
628         
629<li>If a map fails <span class="codefrag">mapred.map.max.attempts</span> times, the
630          remaining map tasks will be killed (unless <span class="codefrag">-i</span> is
631          set).</li>
632
633         
634<li>If <span class="codefrag">mapred.speculative.execution</span> is set
635          <span class="codefrag">final</span> and <span class="codefrag">true</span>, the result of the copy is
636          undefined.</li>
637
638       
639</ul>
640</div> <!-- Appendix -->
641
642 
643</div>
644<!--+
645    |end content
646    +-->
647<div class="clearboth">&nbsp;</div>
648</div>
649<div id="footer">
650<!--+
651    |start bottomstrip
652    +-->
653<div class="lastmodified">
654<script type="text/javascript"><!--
655document.write("Last Published: " + document.lastModified);
656//  --></script>
657</div>
658<div class="copyright">
659        Copyright &copy;
660         2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
661</div>
662<!--+
663    |end bottomstrip
664    +-->
665</div>
666</body>
667</html>
Note: See TracBrowser for help on using the repository browser.