[120] | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> |
---|
| 2 | <html> |
---|
| 3 | <head> |
---|
| 4 | <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
---|
| 5 | <meta content="Apache Forrest" name="Generator"> |
---|
| 6 | <meta name="Forrest-version" content="0.8"> |
---|
| 7 | <meta name="Forrest-skin-name" content="pelt"> |
---|
| 8 | <title> |
---|
| 9 | HDFS User Guide |
---|
| 10 | </title> |
---|
| 11 | <link type="text/css" href="skin/basic.css" rel="stylesheet"> |
---|
| 12 | <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"> |
---|
| 13 | <link media="print" type="text/css" href="skin/print.css" rel="stylesheet"> |
---|
| 14 | <link type="text/css" href="skin/profile.css" rel="stylesheet"> |
---|
| 15 | <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script> |
---|
| 16 | <link rel="shortcut icon" href="images/favicon.ico"> |
---|
| 17 | </head> |
---|
| 18 | <body onload="init()"> |
---|
| 19 | <script type="text/javascript">ndeSetTextSize();</script> |
---|
| 20 | <div id="top"> |
---|
| 21 | <!--+ |
---|
| 22 | |breadtrail |
---|
| 23 | +--> |
---|
| 24 | <div class="breadtrail"> |
---|
| 25 | <a href="http://www.apache.org/">Apache</a> > <a href="http://hadoop.apache.org/">Hadoop</a> > <a href="http://hadoop.apache.org/core/">Core</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script> |
---|
| 26 | </div> |
---|
| 27 | <!--+ |
---|
| 28 | |header |
---|
| 29 | +--> |
---|
| 30 | <div class="header"> |
---|
| 31 | <!--+ |
---|
| 32 | |start group logo |
---|
| 33 | +--> |
---|
| 34 | <div class="grouplogo"> |
---|
| 35 | <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a> |
---|
| 36 | </div> |
---|
| 37 | <!--+ |
---|
| 38 | |end group logo |
---|
| 39 | +--> |
---|
| 40 | <!--+ |
---|
| 41 | |start Project Logo |
---|
| 42 | +--> |
---|
| 43 | <div class="projectlogo"> |
---|
| 44 | <a href="http://hadoop.apache.org/core/"><img class="logoImage" alt="Hadoop" src="images/core-logo.gif" title="Scalable Computing Platform"></a> |
---|
| 45 | </div> |
---|
| 46 | <!--+ |
---|
| 47 | |end Project Logo |
---|
| 48 | +--> |
---|
| 49 | <!--+ |
---|
| 50 | |start Search |
---|
| 51 | +--> |
---|
| 52 | <div class="searchbox"> |
---|
| 53 | <form action="http://www.google.com/search" method="get" class="roundtopsmall"> |
---|
| 54 | <input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google"> |
---|
| 55 | <input name="Search" value="Search" type="submit"> |
---|
| 56 | </form> |
---|
| 57 | </div> |
---|
| 58 | <!--+ |
---|
| 59 | |end search |
---|
| 60 | +--> |
---|
| 61 | <!--+ |
---|
| 62 | |start Tabs |
---|
| 63 | +--> |
---|
| 64 | <ul id="tabs"> |
---|
| 65 | <li> |
---|
| 66 | <a class="unselected" href="http://hadoop.apache.org/core/">Project</a> |
---|
| 67 | </li> |
---|
| 68 | <li> |
---|
| 69 | <a class="unselected" href="http://wiki.apache.org/hadoop">Wiki</a> |
---|
| 70 | </li> |
---|
| 71 | <li class="current"> |
---|
| 72 | <a class="selected" href="index.html">Hadoop 0.20 Documentation</a> |
---|
| 73 | </li> |
---|
| 74 | </ul> |
---|
| 75 | <!--+ |
---|
| 76 | |end Tabs |
---|
| 77 | +--> |
---|
| 78 | </div> |
---|
| 79 | </div> |
---|
| 80 | <div id="main"> |
---|
| 81 | <div id="publishedStrip"> |
---|
| 82 | <!--+ |
---|
| 83 | |start Subtabs |
---|
| 84 | +--> |
---|
| 85 | <div id="level2tabs"></div> |
---|
| 86 | <!--+ |
---|
| 87 | |end Endtabs |
---|
| 88 | +--> |
---|
| 89 | <script type="text/javascript"><!-- |
---|
| 90 | document.write("Last Published: " + document.lastModified); |
---|
| 91 | // --></script> |
---|
| 92 | </div> |
---|
| 93 | <!--+ |
---|
| 94 | |breadtrail |
---|
| 95 | +--> |
---|
| 96 | <div class="breadtrail"> |
---|
| 97 | |
---|
| 98 | |
---|
| 99 | </div> |
---|
| 100 | <!--+ |
---|
| 101 | |start Menu, mainarea |
---|
| 102 | +--> |
---|
| 103 | <!--+ |
---|
| 104 | |start Menu |
---|
| 105 | +--> |
---|
| 106 | <div id="menu"> |
---|
| 107 | <div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Getting Started</div> |
---|
| 108 | <div id="menu_1.1" class="menuitemgroup"> |
---|
| 109 | <div class="menuitem"> |
---|
| 110 | <a href="index.html">Overview</a> |
---|
| 111 | </div> |
---|
| 112 | <div class="menuitem"> |
---|
| 113 | <a href="quickstart.html">Quick Start</a> |
---|
| 114 | </div> |
---|
| 115 | <div class="menuitem"> |
---|
| 116 | <a href="cluster_setup.html">Cluster Setup</a> |
---|
| 117 | </div> |
---|
| 118 | <div class="menuitem"> |
---|
| 119 | <a href="mapred_tutorial.html">Map/Reduce Tutorial</a> |
---|
| 120 | </div> |
---|
| 121 | </div> |
---|
| 122 | <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Programming Guides</div> |
---|
| 123 | <div id="menu_1.2" class="menuitemgroup"> |
---|
| 124 | <div class="menuitem"> |
---|
| 125 | <a href="commands_manual.html">Commands</a> |
---|
| 126 | </div> |
---|
| 127 | <div class="menuitem"> |
---|
| 128 | <a href="distcp.html">DistCp</a> |
---|
| 129 | </div> |
---|
| 130 | <div class="menuitem"> |
---|
| 131 | <a href="native_libraries.html">Native Libraries</a> |
---|
| 132 | </div> |
---|
| 133 | <div class="menuitem"> |
---|
| 134 | <a href="streaming.html">Streaming</a> |
---|
| 135 | </div> |
---|
| 136 | <div class="menuitem"> |
---|
| 137 | <a href="fair_scheduler.html">Fair Scheduler</a> |
---|
| 138 | </div> |
---|
| 139 | <div class="menuitem"> |
---|
| 140 | <a href="capacity_scheduler.html">Capacity Scheduler</a> |
---|
| 141 | </div> |
---|
| 142 | <div class="menuitem"> |
---|
| 143 | <a href="service_level_auth.html">Service Level Authorization</a> |
---|
| 144 | </div> |
---|
| 145 | <div class="menuitem"> |
---|
| 146 | <a href="vaidya.html">Vaidya</a> |
---|
| 147 | </div> |
---|
| 148 | <div class="menuitem"> |
---|
| 149 | <a href="hadoop_archives.html">Archives</a> |
---|
| 150 | </div> |
---|
| 151 | </div> |
---|
| 152 | <div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">HDFS</div> |
---|
| 153 | <div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"> |
---|
| 154 | <div class="menupage"> |
---|
| 155 | <div class="menupagetitle">User Guide</div> |
---|
| 156 | </div> |
---|
| 157 | <div class="menuitem"> |
---|
| 158 | <a href="hdfs_design.html">Architecture</a> |
---|
| 159 | </div> |
---|
| 160 | <div class="menuitem"> |
---|
| 161 | <a href="hdfs_shell.html">File System Shell Guide</a> |
---|
| 162 | </div> |
---|
| 163 | <div class="menuitem"> |
---|
| 164 | <a href="hdfs_permissions_guide.html">Permissions Guide</a> |
---|
| 165 | </div> |
---|
| 166 | <div class="menuitem"> |
---|
| 167 | <a href="hdfs_quota_admin_guide.html">Quotas Guide</a> |
---|
| 168 | </div> |
---|
| 169 | <div class="menuitem"> |
---|
| 170 | <a href="SLG_user_guide.html">Synthetic Load Generator Guide</a> |
---|
| 171 | </div> |
---|
| 172 | <div class="menuitem"> |
---|
| 173 | <a href="libhdfs.html">C API libhdfs</a> |
---|
| 174 | </div> |
---|
| 175 | </div> |
---|
| 176 | <div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">HOD</div> |
---|
| 177 | <div id="menu_1.4" class="menuitemgroup"> |
---|
| 178 | <div class="menuitem"> |
---|
| 179 | <a href="hod_user_guide.html">User Guide</a> |
---|
| 180 | </div> |
---|
| 181 | <div class="menuitem"> |
---|
| 182 | <a href="hod_admin_guide.html">Admin Guide</a> |
---|
| 183 | </div> |
---|
| 184 | <div class="menuitem"> |
---|
| 185 | <a href="hod_config_guide.html">Config Guide</a> |
---|
| 186 | </div> |
---|
| 187 | </div> |
---|
| 188 | <div onclick="SwitchMenu('menu_1.5', 'skin/')" id="menu_1.5Title" class="menutitle">Miscellaneous</div> |
---|
| 189 | <div id="menu_1.5" class="menuitemgroup"> |
---|
| 190 | <div class="menuitem"> |
---|
| 191 | <a href="api/index.html">API Docs</a> |
---|
| 192 | </div> |
---|
| 193 | <div class="menuitem"> |
---|
| 194 | <a href="jdiff/changes.html">API Changes</a> |
---|
| 195 | </div> |
---|
| 196 | <div class="menuitem"> |
---|
| 197 | <a href="http://wiki.apache.org/hadoop/">Wiki</a> |
---|
| 198 | </div> |
---|
| 199 | <div class="menuitem"> |
---|
| 200 | <a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a> |
---|
| 201 | </div> |
---|
| 202 | <div class="menuitem"> |
---|
| 203 | <a href="releasenotes.html">Release Notes</a> |
---|
| 204 | </div> |
---|
| 205 | <div class="menuitem"> |
---|
| 206 | <a href="changes.html">Change Log</a> |
---|
| 207 | </div> |
---|
| 208 | </div> |
---|
| 209 | <div id="credit"></div> |
---|
| 210 | <div id="roundbottom"> |
---|
| 211 | <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div> |
---|
| 212 | <!--+ |
---|
| 213 | |alternative credits |
---|
| 214 | +--> |
---|
| 215 | <div id="credit2"></div> |
---|
| 216 | </div> |
---|
| 217 | <!--+ |
---|
| 218 | |end Menu |
---|
| 219 | +--> |
---|
| 220 | <!--+ |
---|
| 221 | |start content |
---|
| 222 | +--> |
---|
| 223 | <div id="content"> |
---|
| 224 | <div title="Portable Document Format" class="pdflink"> |
---|
| 225 | <a class="dida" href="hdfs_user_guide.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br> |
---|
| 226 | PDF</a> |
---|
| 227 | </div> |
---|
| 228 | <h1> |
---|
| 229 | HDFS User Guide |
---|
| 230 | </h1> |
---|
| 231 | <div id="minitoc-area"> |
---|
| 232 | <ul class="minitoc"> |
---|
| 233 | <li> |
---|
| 234 | <a href="#Purpose">Purpose</a> |
---|
| 235 | </li> |
---|
| 236 | <li> |
---|
| 237 | <a href="#Overview"> Overview </a> |
---|
| 238 | </li> |
---|
| 239 | <li> |
---|
| 240 | <a href="#Pre-requisites"> Pre-requisites </a> |
---|
| 241 | </li> |
---|
| 242 | <li> |
---|
| 243 | <a href="#Web+Interface"> Web Interface </a> |
---|
| 244 | </li> |
---|
| 245 | <li> |
---|
| 246 | <a href="#Shell+Commands">Shell Commands</a> |
---|
| 247 | <ul class="minitoc"> |
---|
| 248 | <li> |
---|
| 249 | <a href="#DFSAdmin+Command"> DFSAdmin Command </a> |
---|
| 250 | </li> |
---|
| 251 | </ul> |
---|
| 252 | </li> |
---|
| 253 | <li> |
---|
| 254 | <a href="#Secondary+NameNode"> Secondary NameNode </a> |
---|
| 255 | </li> |
---|
| 256 | <li> |
---|
| 257 | <a href="#Rebalancer"> Rebalancer </a> |
---|
| 258 | </li> |
---|
| 259 | <li> |
---|
| 260 | <a href="#Rack+Awareness"> Rack Awareness </a> |
---|
| 261 | </li> |
---|
| 262 | <li> |
---|
| 263 | <a href="#Safemode"> Safemode </a> |
---|
| 264 | </li> |
---|
| 265 | <li> |
---|
| 266 | <a href="#fsck"> fsck </a> |
---|
| 267 | </li> |
---|
| 268 | <li> |
---|
| 269 | <a href="#Upgrade+and+Rollback"> Upgrade and Rollback </a> |
---|
| 270 | </li> |
---|
| 271 | <li> |
---|
| 272 | <a href="#File+Permissions+and+Security"> File Permissions and Security </a> |
---|
| 273 | </li> |
---|
| 274 | <li> |
---|
| 275 | <a href="#Scalability"> Scalability </a> |
---|
| 276 | </li> |
---|
| 277 | <li> |
---|
| 278 | <a href="#Related+Documentation"> Related Documentation </a> |
---|
| 279 | </li> |
---|
| 280 | </ul> |
---|
| 281 | </div> |
---|
| 282 | |
---|
| 283 | <a name="N1000D"></a><a name="Purpose"></a> |
---|
| 284 | <h2 class="h3">Purpose</h2> |
---|
| 285 | <div class="section"> |
---|
| 286 | <p> |
---|
| 287 | This document is a starting point for users working with |
---|
| 288 | Hadoop Distributed File System (HDFS) either as a part of a |
---|
| 289 | <a href="http://hadoop.apache.org/">Hadoop</a> |
---|
| 290 | cluster or as a stand-alone general purpose distributed file system. |
---|
| 291 | While HDFS is designed to "just work" in many environments, a working |
---|
| 292 | knowledge of HDFS helps greatly with configuration improvements and |
---|
| 293 | diagnostics on a specific cluster. |
---|
| 294 | </p> |
---|
| 295 | </div> |
---|
| 296 | |
---|
| 297 | |
---|
| 298 | <a name="N1001B"></a><a name="Overview"></a> |
---|
| 299 | <h2 class="h3"> Overview </h2> |
---|
| 300 | <div class="section"> |
---|
| 301 | <p> |
---|
| 302 | HDFS is the primary distributed storage used by Hadoop applications. A |
---|
| 303 | HDFS cluster primarily consists of a NameNode that manages the |
---|
| 304 | file system metadata and DataNodes that store the actual data. The |
---|
| 305 | <a href="hdfs_design.html">HDFS Architecture</a> describes HDFS in detail. This user guide primarily deals with |
---|
| 306 | the interaction of users and administrators with HDFS clusters. |
---|
| 307 | The <a href="images/hdfsarchitecture.gif">HDFS architecture diagram</a> depicts |
---|
| 308 | basic interactions among NameNode, the DataNodes, and the clients. |
---|
| 309 | Clients contact NameNode for file metadata or file modifications and perform |
---|
| 310 | actual file I/O directly with the DataNodes. |
---|
| 311 | </p> |
---|
| 312 | <p> |
---|
| 313 | The following are some of the salient features that could be of |
---|
| 314 | interest to many users. |
---|
| 315 | </p> |
---|
| 316 | <ul> |
---|
| 317 | |
---|
| 318 | <li> |
---|
| 319 | Hadoop, including HDFS, is well suited for distributed storage |
---|
| 320 | and distributed processing using commodity hardware. It is fault |
---|
| 321 | tolerant, scalable, and extremely simple to expand. |
---|
| 322 | <a href="mapred_tutorial.html">Map/Reduce</a>, |
---|
| 323 | well known for its simplicity and applicability for large set of |
---|
| 324 | distributed applications, is an integral part of Hadoop. |
---|
| 325 | </li> |
---|
| 326 | |
---|
| 327 | <li> |
---|
| 328 | HDFS is highly configurable with a default configuration well |
---|
| 329 | suited for many installations. Most of the time, configuration |
---|
| 330 | needs to be tuned only for very large clusters. |
---|
| 331 | </li> |
---|
| 332 | |
---|
| 333 | <li> |
---|
| 334 | Hadoop is written in Java and is supported on all major platforms. |
---|
| 335 | </li> |
---|
| 336 | |
---|
| 337 | <li> |
---|
| 338 | Hadoop supports shell-like commands to interact with HDFS directly. |
---|
| 339 | </li> |
---|
| 340 | |
---|
| 341 | <li> |
---|
| 342 | The NameNode and Datanodes have built-in web servers that make it |
---|
| 343 | easy to check current status of the cluster. |
---|
| 344 | </li> |
---|
| 345 | |
---|
| 346 | <li> |
---|
| 347 | New features and improvements are regularly implemented in HDFS. |
---|
| 348 | The following is a subset of useful features in HDFS: |
---|
| 349 | <ul> |
---|
| 350 | |
---|
| 351 | <li> |
---|
| 352 | File permissions and authentication. |
---|
| 353 | </li> |
---|
| 354 | |
---|
| 355 | <li> |
---|
| 356 | |
---|
| 357 | <em>Rack awareness</em>: to take a node's physical location into |
---|
| 358 | account while scheduling tasks and allocating storage. |
---|
| 359 | </li> |
---|
| 360 | |
---|
| 361 | <li> |
---|
| 362 | Safemode: an administrative mode for maintenance. |
---|
| 363 | </li> |
---|
| 364 | |
---|
| 365 | <li> |
---|
| 366 | |
---|
| 367 | <span class="codefrag">fsck</span>: a utility to diagnose health of the file system, to |
---|
| 368 | find missing files or blocks. |
---|
| 369 | </li> |
---|
| 370 | |
---|
| 371 | <li> |
---|
| 372 | Rebalancer: tool to balance the cluster when the data is |
---|
| 373 | unevenly distributed among DataNodes. |
---|
| 374 | </li> |
---|
| 375 | |
---|
| 376 | <li> |
---|
| 377 | Upgrade and rollback: after a software upgrade, |
---|
| 378 | it is possible to |
---|
| 379 | rollback to HDFS' state before the upgrade in case of unexpected |
---|
| 380 | problems. |
---|
| 381 | </li> |
---|
| 382 | |
---|
| 383 | <li> |
---|
| 384 | Secondary NameNode: performs periodic checkpoints of the |
---|
| 385 | namespace and helps keep the size of file containing log of HDFS |
---|
| 386 | modifications within certain limits at the NameNode. |
---|
| 387 | </li> |
---|
| 388 | |
---|
| 389 | </ul> |
---|
| 390 | |
---|
| 391 | </li> |
---|
| 392 | |
---|
| 393 | </ul> |
---|
| 394 | </div> |
---|
| 395 | <a name="N10067"></a><a name="Pre-requisites"></a> |
---|
| 396 | <h2 class="h3"> Pre-requisites </h2> |
---|
| 397 | <div class="section"> |
---|
| 398 | <p> |
---|
| 399 | The following documents describe installation and set up of a |
---|
| 400 | Hadoop cluster: |
---|
| 401 | </p> |
---|
| 402 | <ul> |
---|
| 403 | |
---|
| 404 | <li> |
---|
| 405 | |
---|
| 406 | <a href="quickstart.html">Hadoop Quick Start</a> |
---|
| 407 | for first-time users. |
---|
| 408 | </li> |
---|
| 409 | |
---|
| 410 | <li> |
---|
| 411 | |
---|
| 412 | <a href="cluster_setup.html">Hadoop Cluster Setup</a> |
---|
| 413 | for large, distributed clusters. |
---|
| 414 | </li> |
---|
| 415 | |
---|
| 416 | </ul> |
---|
| 417 | <p> |
---|
| 418 | The rest of this document assumes the user is able to set up and run a |
---|
| 419 | HDFS with at least one DataNode. For the purpose of this document, |
---|
| 420 | both the NameNode and DataNode could be running on the same physical |
---|
| 421 | machine. |
---|
| 422 | </p> |
---|
| 423 | </div> |
---|
| 424 | <a name="N10085"></a><a name="Web+Interface"></a> |
---|
| 425 | <h2 class="h3"> Web Interface </h2> |
---|
| 426 | <div class="section"> |
---|
| 427 | <p> |
---|
| 428 | NameNode and DataNode each run an internal web server in order to |
---|
| 429 | display basic information about the current status of the cluster. |
---|
| 430 | With the default configuration, the NameNode front page is at |
---|
| 431 | <span class="codefrag">http://namenode-name:50070/</span>. |
---|
| 432 | It lists the DataNodes in the cluster and basic statistics of the |
---|
| 433 | cluster. The web interface can also be used to browse the file |
---|
| 434 | system (using "Browse the file system" link on the NameNode front |
---|
| 435 | page). |
---|
| 436 | </p> |
---|
| 437 | </div> |
---|
| 438 | <a name="N10092"></a><a name="Shell+Commands"></a> |
---|
| 439 | <h2 class="h3">Shell Commands</h2> |
---|
| 440 | <div class="section"> |
---|
| 441 | <p> |
---|
| 442 | Hadoop includes various shell-like commands that directly |
---|
| 443 | interact with HDFS and other file systems that Hadoop supports. |
---|
| 444 | The command |
---|
| 445 | <span class="codefrag">bin/hadoop fs -help</span> |
---|
| 446 | lists the commands supported by Hadoop |
---|
| 447 | shell. Furthermore, the command |
---|
| 448 | <span class="codefrag">bin/hadoop fs -help command-name</span> |
---|
| 449 | displays more detailed help for a command. These commands support |
---|
| 450 | most of the normal file system operations like copying files, |
---|
| 451 | changing file permissions, etc. It also supports a few HDFS |
---|
| 452 | specific operations like changing replication of files. |
---|
| 453 | </p> |
---|
| 454 | <a name="N100A1"></a><a name="DFSAdmin+Command"></a> |
---|
| 455 | <h3 class="h4"> DFSAdmin Command </h3> |
---|
| 456 | <p> |
---|
| 457 | The <span class="codefrag">bin/hadoop dfsadmin</span> |
---|
| 458 | command supports a few HDFS administration related operations. |
---|
| 459 | The <span class="codefrag">bin/hadoop dfsadmin -help</span> command |
---|
| 460 | lists all the commands currently supported. For example: |
---|
| 461 | </p> |
---|
| 462 | <ul> |
---|
| 463 | |
---|
| 464 | <li> |
---|
| 465 | |
---|
| 466 | <span class="codefrag">-report</span> |
---|
| 467 | : reports basic statistics of HDFS. Some of this information is |
---|
| 468 | also available on the NameNode front page. |
---|
| 469 | </li> |
---|
| 470 | |
---|
| 471 | <li> |
---|
| 472 | |
---|
| 473 | <span class="codefrag">-safemode</span> |
---|
| 474 | : though usually not required, an administrator can manually enter |
---|
| 475 | or leave Safemode. |
---|
| 476 | </li> |
---|
| 477 | |
---|
| 478 | <li> |
---|
| 479 | |
---|
| 480 | <span class="codefrag">-finalizeUpgrade</span> |
---|
| 481 | : removes previous backup of the cluster made during last upgrade. |
---|
| 482 | </li> |
---|
| 483 | |
---|
| 484 | <li> |
---|
| 485 | |
---|
| 486 | <span class="codefrag">-refreshNodes</span> |
---|
| 487 | : Updates the set of hosts allowed to connect to namenode. |
---|
| 488 | Re-reads the config file to update values defined by dfs.hosts and |
---|
| 489 | dfs.hosts.exclude and reads the entries (hostnames) in those files. |
---|
| 490 | Each entry not defined in dfs.hosts but in dfs.hosts.exclude |
---|
| 491 | is decommissioned. Each entry defined in dfs.hosts and also in |
---|
| 492 | dfs.hosts.exclude is stopped from decommissioning if it has already |
---|
| 493 | been marked for decommission. Entries not present in both the lists |
---|
| 494 | are decommissioned. |
---|
| 495 | </li> |
---|
| 496 | |
---|
| 497 | </ul> |
---|
| 498 | <p> |
---|
| 499 | For command usage, see <a href="commands_manual.html#dfsadmin">dfsadmin command</a>. |
---|
| 500 | </p> |
---|
| 501 | </div> |
---|
| 502 | <a name="N100D4"></a><a name="Secondary+NameNode"></a> |
---|
| 503 | <h2 class="h3"> Secondary NameNode </h2> |
---|
| 504 | <div class="section"> |
---|
| 505 | <p> |
---|
| 506 | The NameNode stores modifications to the file system as a log |
---|
| 507 | appended to a native file system file (<span class="codefrag">edits</span>). |
---|
| 508 | When a NameNode starts up, it reads HDFS state from an image |
---|
| 509 | file (<span class="codefrag">fsimage</span>) and then applies edits from the |
---|
| 510 | edits log file. It then writes new HDFS state to the <span class="codefrag">fsimage</span> |
---|
| 511 | and starts normal |
---|
| 512 | operation with an empty edits file. Since NameNode merges |
---|
| 513 | <span class="codefrag">fsimage</span> and <span class="codefrag">edits</span> files only during start up, |
---|
| 514 | the edits log file could get very large over time on a busy cluster. |
---|
| 515 | Another side effect of a larger edits file is that next |
---|
| 516 | restart of NameNode takes longer. |
---|
| 517 | </p> |
---|
| 518 | <p> |
---|
| 519 | The secondary NameNode merges the fsimage and the edits log files periodically |
---|
| 520 | and keeps edits log size within a limit. It is usually run on a |
---|
| 521 | different machine than the primary NameNode since its memory requirements |
---|
| 522 | are on the same order as the primary NameNode. The secondary |
---|
| 523 | NameNode is started by <span class="codefrag">bin/start-dfs.sh</span> on the nodes |
---|
| 524 | specified in <span class="codefrag">conf/masters</span> file. |
---|
| 525 | </p> |
---|
| 526 | <p> |
---|
| 527 | The start of the checkpoint process on the secondary NameNode is |
---|
| 528 | controlled by two configuration parameters. |
---|
| 529 | </p> |
---|
| 530 | <ul> |
---|
| 531 | |
---|
| 532 | <li> |
---|
| 533 | |
---|
| 534 | <span class="codefrag">fs.checkpoint.period</span>, set to 1 hour by default, specifies |
---|
| 535 | the maximum delay between two consecutive checkpoints, and |
---|
| 536 | </li> |
---|
| 537 | |
---|
| 538 | <li> |
---|
| 539 | |
---|
| 540 | <span class="codefrag">fs.checkpoint.size</span>, set to 64MB by default, defines the |
---|
| 541 | size of the edits log file that forces an urgent checkpoint even if |
---|
| 542 | the maximum checkpoint delay is not reached. |
---|
| 543 | </li> |
---|
| 544 | |
---|
| 545 | </ul> |
---|
| 546 | <p> |
---|
| 547 | The secondary NameNode stores the latest checkpoint in a |
---|
| 548 | directory which is structured the same way as the primary NameNode's |
---|
| 549 | directory, so that the checkpointed image is always ready to be |
---|
| 550 | read by the primary NameNode if necessary. |
---|
| 551 | </p> |
---|
| 552 | <p> |
---|
| 553 | The latest checkpoint can be imported to the primary NameNode if |
---|
| 554 | all other copies of the image and the edits files are lost. |
---|
| 555 | In order to do that one should: |
---|
| 556 | </p> |
---|
| 557 | <ul> |
---|
| 558 | |
---|
| 559 | <li> |
---|
| 560 | Create an empty directory specified in the |
---|
| 561 | <span class="codefrag">dfs.name.dir</span> configuration variable; |
---|
| 562 | </li> |
---|
| 563 | |
---|
| 564 | <li> |
---|
| 565 | Specify the location of the checkpoint directory in the |
---|
| 566 | configuration variable <span class="codefrag">fs.checkpoint.dir</span>; |
---|
| 567 | </li> |
---|
| 568 | |
---|
| 569 | <li> |
---|
| 570 | and start the NameNode with <span class="codefrag">-importCheckpoint</span> option. |
---|
| 571 | </li> |
---|
| 572 | |
---|
| 573 | </ul> |
---|
| 574 | <p> |
---|
| 575 | The NameNode will upload the checkpoint from the |
---|
| 576 | <span class="codefrag">fs.checkpoint.dir</span> directory and then save it to the NameNode |
---|
| 577 | directory(s) set in <span class="codefrag">dfs.name.dir</span>. |
---|
| 578 | The NameNode will fail if a legal image is contained in |
---|
| 579 | <span class="codefrag">dfs.name.dir</span>. |
---|
| 580 | The NameNode verifies that the image in <span class="codefrag">fs.checkpoint.dir</span> is |
---|
| 581 | consistent, but does not modify it in any way. |
---|
| 582 | </p> |
---|
| 583 | <p> |
---|
| 584 | For command usage, see <a href="commands_manual.html#secondarynamenode"><span class="codefrag">secondarynamenode</span> command</a>. |
---|
| 585 | </p> |
---|
| 586 | </div> |
---|
| 587 | <a name="N1013B"></a><a name="Rebalancer"></a> |
---|
| 588 | <h2 class="h3"> Rebalancer </h2> |
---|
| 589 | <div class="section"> |
---|
| 590 | <p> |
---|
| 591 | HDFS data might not always be placed uniformly across the |
---|
| 592 | DataNode. One common reason is addition of new DataNodes to an |
---|
| 593 | existing cluster. While placing new blocks (data for a file is |
---|
| 594 | stored as a series of blocks), NameNode considers various |
---|
| 595 | parameters before choosing the DataNodes to receive these blocks. |
---|
| 596 | Some of the considerations are: |
---|
| 597 | </p> |
---|
| 598 | <ul> |
---|
| 599 | |
---|
| 600 | <li> |
---|
| 601 | Policy to keep one of the replicas of a block on the same node |
---|
| 602 | as the node that is writing the block. |
---|
| 603 | </li> |
---|
| 604 | |
---|
| 605 | <li> |
---|
| 606 | Need to spread different replicas of a block across the racks so |
---|
| 607 | that cluster can survive loss of whole rack. |
---|
| 608 | </li> |
---|
| 609 | |
---|
| 610 | <li> |
---|
| 611 | One of the replicas is usually placed on the same rack as the |
---|
| 612 | node writing to the file so that cross-rack network I/O is |
---|
| 613 | reduced. |
---|
| 614 | </li> |
---|
| 615 | |
---|
| 616 | <li> |
---|
| 617 | Spread HDFS data uniformly across the DataNodes in the cluster. |
---|
| 618 | </li> |
---|
| 619 | |
---|
| 620 | </ul> |
---|
| 621 | <p> |
---|
| 622 | Due to multiple competing considerations, data might not be |
---|
| 623 | uniformly placed across the DataNodes. |
---|
| 624 | HDFS provides a tool for administrators that analyzes block |
---|
| 625 | placement and rebalances data across the DataNodes. A brief |
---|
| 626 | administrator's guide for rebalancer as a |
---|
| 627 | <a href="http://issues.apache.org/jira/secure/attachment/12368261/RebalanceDesign6.pdf">PDF</a> |
---|
| 628 | is attached to |
---|
| 629 | <a href="http://issues.apache.org/jira/browse/HADOOP-1652">HADOOP-1652</a>. |
---|
| 630 | </p> |
---|
| 631 | <p> |
---|
| 632 | For command usage, see <a href="commands_manual.html#balancer">balancer command</a>. |
---|
| 633 | </p> |
---|
| 634 | </div> |
---|
| 635 | <a name="N10166"></a><a name="Rack+Awareness"></a> |
---|
| 636 | <h2 class="h3"> Rack Awareness </h2> |
---|
| 637 | <div class="section"> |
---|
| 638 | <p> |
---|
| 639 | Typically large Hadoop clusters are arranged in racks and |
---|
| 640 | network traffic between different nodes within the same rack is |
---|
| 641 | much more desirable than network traffic across the racks. In |
---|
| 642 | addition NameNode tries to place replicas of block on |
---|
| 643 | multiple racks for improved fault tolerance. Hadoop lets the |
---|
| 644 | cluster administrators decide which rack a node belongs to |
---|
| 645 | through configuration variable <span class="codefrag">dfs.network.script</span>. When this |
---|
| 646 | script is configured, each node runs the script to determine its |
---|
| 647 | rack id. A default installation assumes all the nodes belong to |
---|
| 648 | the same rack. This feature and configuration is further described |
---|
| 649 | in <a href="http://issues.apache.org/jira/secure/attachment/12345251/Rack_aware_HDFS_proposal.pdf">PDF</a> |
---|
| 650 | attached to |
---|
| 651 | <a href="http://issues.apache.org/jira/browse/HADOOP-692">HADOOP-692</a>. |
---|
| 652 | </p> |
---|
| 653 | </div> |
---|
| 654 | <a name="N1017B"></a><a name="Safemode"></a> |
---|
| 655 | <h2 class="h3"> Safemode </h2> |
---|
| 656 | <div class="section"> |
---|
| 657 | <p> |
---|
| 658 | During start up the NameNode loads the file system state from the |
---|
| 659 | fsimage and the edits log file. It then waits for DataNodes |
---|
| 660 | to report their blocks so that it does not prematurely start |
---|
| 661 | replicating the blocks though enough replicas already exist in the |
---|
| 662 | cluster. During this time NameNode stays in Safemode. |
---|
| 663 | Safemode |
---|
| 664 | for the NameNode is essentially a read-only mode for the HDFS cluster, |
---|
| 665 | where it does not allow any modifications to file system or blocks. |
---|
| 666 | Normally the NameNode leaves Safemode automatically after the DataNodes |
---|
| 667 | have reported that most file system blocks are available. |
---|
| 668 | If required, HDFS could be placed in Safemode explicitly |
---|
| 669 | using <span class="codefrag">'bin/hadoop dfsadmin -safemode'</span> command. NameNode front |
---|
| 670 | page shows whether Safemode is on or off. A more detailed |
---|
| 671 | description and configuration is maintained as JavaDoc for |
---|
| 672 | <a href="http://hadoop.apache.org/core/docs/current/api/org/apache/hadoop/dfs/NameNode.html#setSafeMode(org.apache.hadoop.dfs.FSConstants.SafeModeAction)"><span class="codefrag">setSafeMode()</span></a>. |
---|
| 673 | </p> |
---|
| 674 | </div> |
---|
| 675 | <a name="N1018D"></a><a name="fsck"></a> |
---|
| 676 | <h2 class="h3"> fsck </h2> |
---|
| 677 | <div class="section"> |
---|
| 678 | <p> |
---|
| 679 | HDFS supports the <span class="codefrag">fsck</span> command to check for various |
---|
| 680 | inconsistencies. |
---|
| 681 | It is designed for reporting problems with various |
---|
| 682 | files, for example, missing blocks for a file or under-replicated |
---|
| 683 | blocks. Unlike a traditional <span class="codefrag">fsck</span> utility for native file systems, |
---|
| 684 | this command does not correct the errors it detects. Normally NameNode |
---|
| 685 | automatically corrects most of the recoverable failures. By default |
---|
| 686 | <span class="codefrag">fsck</span> ignores open files but provides an option to select all files during reporting. |
---|
| 687 | The HDFS <span class="codefrag">fsck</span> command is not a |
---|
| 688 | Hadoop shell command. It can be run as '<span class="codefrag">bin/hadoop fsck</span>'. |
---|
| 689 | For command usage, see <a href="commands_manual.html#fsck"><span class="codefrag">fsck</span> command</a>. |
---|
| 690 | <span class="codefrag">fsck</span> can be run on the whole file system or on a subset of files. |
---|
| 691 | </p> |
---|
| 692 | </div> |
---|
| 693 | <a name="N101AF"></a><a name="Upgrade+and+Rollback"></a> |
---|
| 694 | <h2 class="h3"> Upgrade and Rollback </h2> |
---|
| 695 | <div class="section"> |
---|
| 696 | <p> |
---|
| 697 | When Hadoop is upgraded on an existing cluster, as with any |
---|
| 698 | software upgrade, it is possible there are new bugs or |
---|
| 699 | incompatible changes that affect existing applications and were |
---|
| 700 | not discovered earlier. In any non-trivial HDFS installation, it |
---|
| 701 | is not an option to lose any data, let alone to restart HDFS from |
---|
| 702 | scratch. HDFS allows administrators to go back to an earlier version |
---|
| 703 | of Hadoop and roll back the cluster to the state it was in |
---|
| 704 | before |
---|
| 705 | the upgrade. HDFS upgrade is described in more detail in |
---|
| 706 | <a href="http://wiki.apache.org/hadoop/Hadoop%20Upgrade">upgrade wiki</a>. |
---|
| 707 | HDFS can have one such backup at a time. Before upgrading, |
---|
| 708 | administrators need to remove existing backup using <span class="codefrag">bin/hadoop |
---|
| 709 | dfsadmin -finalizeUpgrade</span> command. The following |
---|
| 710 | briefly describes the typical upgrade procedure: |
---|
| 711 | </p> |
---|
| 712 | <ul> |
---|
| 713 | |
---|
| 714 | <li> |
---|
| 715 | Before upgrading Hadoop software, |
---|
| 716 | <em>finalize</em> if there is an existing backup. |
---|
| 717 | <span class="codefrag">dfsadmin -upgradeProgress status</span> |
---|
| 718 | can tell if the cluster needs to be <em>finalized</em>. |
---|
| 719 | </li> |
---|
| 720 | |
---|
| 721 | <li>Stop the cluster and distribute new version of Hadoop.</li> |
---|
| 722 | |
---|
| 723 | <li> |
---|
| 724 | Run the new version with <span class="codefrag">-upgrade</span> option |
---|
| 725 | (<span class="codefrag">bin/start-dfs.sh -upgrade</span>). |
---|
| 726 | </li> |
---|
| 727 | |
---|
| 728 | <li> |
---|
| 729 | Most of the time, the cluster works just fine. Once the new HDFS is |
---|
| 730 | considered working well (maybe after a few days of operation), |
---|
| 731 | finalize the upgrade. Note that until the cluster is finalized, |
---|
| 732 | deleting the files that existed before the upgrade does not free |
---|
| 733 | up real disk space on the DataNodes. |
---|
| 734 | </li> |
---|
| 735 | |
---|
| 736 | <li> |
---|
| 737 | If there is a need to move back to the old version, |
---|
| 738 | <ul> |
---|
| 739 | |
---|
| 740 | <li> stop the cluster and distribute earlier version of Hadoop. </li> |
---|
| 741 | |
---|
| 742 | <li> start the cluster with rollback option. |
---|
| 743 | (<span class="codefrag">bin/start-dfs.sh -rollback</span>). |
---|
| 744 | </li> |
---|
| 745 | |
---|
| 746 | </ul> |
---|
| 747 | |
---|
| 748 | </li> |
---|
| 749 | |
---|
| 750 | </ul> |
---|
| 751 | </div> |
---|
| 752 | <a name="N101ED"></a><a name="File+Permissions+and+Security"></a> |
---|
| 753 | <h2 class="h3"> File Permissions and Security </h2> |
---|
| 754 | <div class="section"> |
---|
| 755 | <p> |
---|
| 756 | The file permissions are designed to be similar to file permissions on |
---|
| 757 | other familiar platforms like Linux. Currently, security is limited |
---|
| 758 | to simple file permissions. The user that starts NameNode is |
---|
| 759 | treated as the superuser for HDFS. Future versions of HDFS will |
---|
| 760 | support network authentication protocols like Kerberos for user |
---|
| 761 | authentication and encryption of data transfers. The details are discussed in the |
---|
| 762 | <a href="hdfs_permissions_guide.html">HDFS Admin Guide: Permissions</a>. |
---|
| 763 | </p> |
---|
| 764 | </div> |
---|
| 765 | <a name="N101FB"></a><a name="Scalability"></a> |
---|
| 766 | <h2 class="h3"> Scalability </h2> |
---|
| 767 | <div class="section"> |
---|
| 768 | <p> |
---|
| 769 | Hadoop currently runs on clusters with thousands of nodes. |
---|
| 770 | <a href="http://wiki.apache.org/hadoop/PoweredBy">Powered By Hadoop</a> |
---|
| 771 | lists some of the organizations that deploy Hadoop on large |
---|
| 772 | clusters. HDFS has one NameNode for each cluster. Currently |
---|
| 773 | the total memory available on NameNode is the primary scalability |
---|
| 774 | limitation. On very large clusters, increasing average size of |
---|
| 775 | files stored in HDFS helps with increasing cluster size without |
---|
| 776 | increasing memory requirements on NameNode. |
---|
| 777 | |
---|
| 778 | The default configuration may not suit very large clusters. |
---|
| 779 | <a href="http://wiki.apache.org/hadoop/FAQ">Hadoop FAQ</a> page lists |
---|
| 780 | suggested configuration improvements for large Hadoop clusters. |
---|
| 781 | </p> |
---|
| 782 | </div> |
---|
| 783 | <a name="N1020D"></a><a name="Related+Documentation"></a> |
---|
| 784 | <h2 class="h3"> Related Documentation </h2> |
---|
| 785 | <div class="section"> |
---|
| 786 | <p> |
---|
| 787 | This user guide is a good starting point for |
---|
| 788 | working with HDFS. While the user guide continues to improve, |
---|
| 789 | there is a large wealth of documentation about Hadoop and HDFS. |
---|
| 790 | The following list is a starting point for further exploration: |
---|
| 791 | </p> |
---|
| 792 | <ul> |
---|
| 793 | |
---|
| 794 | <li> |
---|
| 795 | |
---|
| 796 | <a href="http://hadoop.apache.org/">Hadoop Home Page</a>: The start page for everything Hadoop. |
---|
| 797 | </li> |
---|
| 798 | |
---|
| 799 | <li> |
---|
| 800 | |
---|
| 801 | <a href="http://wiki.apache.org/hadoop/FrontPage">Hadoop Wiki</a> |
---|
| 802 | : Front page for Hadoop Wiki documentation. Unlike this |
---|
| 803 | guide which is part of Hadoop source tree, Hadoop Wiki is |
---|
| 804 | regularly edited by Hadoop Community. |
---|
| 805 | </li> |
---|
| 806 | |
---|
| 807 | <li> |
---|
| 808 | <a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a> from Hadoop Wiki. |
---|
| 809 | </li> |
---|
| 810 | |
---|
| 811 | <li> |
---|
| 812 | Hadoop <a href="api/">JavaDoc API</a>. |
---|
| 813 | </li> |
---|
| 814 | |
---|
| 815 | <li> |
---|
| 816 | Hadoop User Mailing List : |
---|
| 817 | <a href="mailto:core-user@hadoop.apache.org">core-user[at]hadoop.apache.org</a>. |
---|
| 818 | </li> |
---|
| 819 | |
---|
| 820 | <li> |
---|
| 821 | Explore <span class="codefrag">src/hdfs/hdfs-default.xml</span>. |
---|
| 822 | It includes a brief |
---|
| 823 | description of most of the configuration variables available. |
---|
| 824 | </li> |
---|
| 825 | |
---|
| 826 | <li> |
---|
| 827 | |
---|
| 828 | <a href="commands_manual.html">Hadoop Command Guide</a>: commands usage. |
---|
| 829 | </li> |
---|
| 830 | |
---|
| 831 | </ul> |
---|
| 832 | </div> |
---|
| 833 | |
---|
| 834 | |
---|
| 835 | </div> |
---|
| 836 | <!--+ |
---|
| 837 | |end content |
---|
| 838 | +--> |
---|
| 839 | <div class="clearboth"> </div> |
---|
| 840 | </div> |
---|
| 841 | <div id="footer"> |
---|
| 842 | <!--+ |
---|
| 843 | |start bottomstrip |
---|
| 844 | +--> |
---|
| 845 | <div class="lastmodified"> |
---|
| 846 | <script type="text/javascript"><!-- |
---|
| 847 | document.write("Last Published: " + document.lastModified); |
---|
| 848 | // --></script> |
---|
| 849 | </div> |
---|
| 850 | <div class="copyright"> |
---|
| 851 | Copyright © |
---|
| 852 | 2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a> |
---|
| 853 | </div> |
---|
| 854 | <!--+ |
---|
| 855 | |end bottomstrip |
---|
| 856 | +--> |
---|
| 857 | </div> |
---|
| 858 | </body> |
---|
| 859 | </html> |
---|