1 | /** |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one |
---|
3 | * or more contributor license agreements. See the NOTICE file |
---|
4 | * distributed with this work for additional information |
---|
5 | * regarding copyright ownership. The ASF licenses this file |
---|
6 | * to you under the Apache License, Version 2.0 (the |
---|
7 | * "License"); you may not use this file except in compliance |
---|
8 | * with the License. You may obtain a copy of the License at |
---|
9 | * |
---|
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | * |
---|
12 | * Unless required by applicable law or agreed to in writing, software |
---|
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | * See the License for the specific language governing permissions and |
---|
16 | * limitations under the License. |
---|
17 | */ |
---|
18 | |
---|
19 | package org.apache.hadoop.hdfs; |
---|
20 | |
---|
21 | import java.io.*; |
---|
22 | import java.nio.channels.FileChannel; |
---|
23 | import java.nio.ByteBuffer; |
---|
24 | import java.util.Random; |
---|
25 | import junit.framework.*; |
---|
26 | import org.apache.hadoop.conf.Configuration; |
---|
27 | import org.apache.hadoop.fs.FileSystem; |
---|
28 | import org.apache.hadoop.fs.LocalFileSystem; |
---|
29 | import org.apache.hadoop.fs.ChecksumException; |
---|
30 | import org.apache.hadoop.fs.Path; |
---|
31 | |
---|
32 | /** |
---|
33 | * A JUnit test for corrupted file handling. |
---|
34 | * This test creates a bunch of files/directories with replication |
---|
35 | * factor of 2. Then verifies that a client can automatically |
---|
36 | * access the remaining valid replica inspite of the following |
---|
37 | * types of simulated errors: |
---|
38 | * |
---|
39 | * 1. Delete meta file on one replica |
---|
40 | * 2. Truncates meta file on one replica |
---|
41 | * 3. Corrupts the meta file header on one replica |
---|
42 | * 4. Corrupts any random offset and portion of the meta file |
---|
43 | * 5. Swaps two meta files, i.e the format of the meta files |
---|
44 | * are valid but their CRCs do not match with their corresponding |
---|
45 | * data blocks |
---|
46 | * The above tests are run for varied values of io.bytes.per.checksum |
---|
47 | * and dfs.block.size. It tests for the case when the meta file is |
---|
48 | * multiple blocks. |
---|
49 | * |
---|
50 | * Another portion of the test is commented out till HADOOP-1557 |
---|
51 | * is addressed: |
---|
52 | * 1. Create file with 2 replica, corrupt the meta file of replica, |
---|
53 | * decrease replication factor from 2 to 1. Validate that the |
---|
54 | * remaining replica is the good one. |
---|
55 | * 2. Create file with 2 replica, corrupt the meta file of one replica, |
---|
56 | * increase replication factor of file to 3. verify that the new |
---|
57 | * replica was created from the non-corrupted replica. |
---|
58 | */ |
---|
59 | public class TestCrcCorruption extends TestCase { |
---|
60 | |
---|
61 | public TestCrcCorruption(String testName) { |
---|
62 | super(testName); |
---|
63 | } |
---|
64 | |
---|
65 | protected void setUp() throws Exception { |
---|
66 | } |
---|
67 | |
---|
68 | protected void tearDown() throws Exception { |
---|
69 | } |
---|
70 | |
---|
71 | /** |
---|
72 | * check if DFS can handle corrupted CRC blocks |
---|
73 | */ |
---|
74 | private void thistest(Configuration conf, DFSTestUtil util) throws Exception { |
---|
75 | MiniDFSCluster cluster = null; |
---|
76 | int numDataNodes = 2; |
---|
77 | short replFactor = 2; |
---|
78 | Random random = new Random(); |
---|
79 | |
---|
80 | try { |
---|
81 | cluster = new MiniDFSCluster(conf, numDataNodes, true, null); |
---|
82 | cluster.waitActive(); |
---|
83 | FileSystem fs = cluster.getFileSystem(); |
---|
84 | util.createFiles(fs, "/srcdat", replFactor); |
---|
85 | util.waitReplication(fs, "/srcdat", (short)2); |
---|
86 | |
---|
87 | // Now deliberately remove/truncate meta blocks from the first |
---|
88 | // directory of the first datanode. The complete absense of a meta |
---|
89 | // file disallows this Datanode to send data to another datanode. |
---|
90 | // However, a client is alowed access to this block. |
---|
91 | // |
---|
92 | File data_dir = new File(System.getProperty("test.build.data"), |
---|
93 | "dfs/data/data1/current"); |
---|
94 | assertTrue("data directory does not exist", data_dir.exists()); |
---|
95 | File[] blocks = data_dir.listFiles(); |
---|
96 | assertTrue("Blocks do not exist in data-dir", (blocks != null) && (blocks.length > 0)); |
---|
97 | int num = 0; |
---|
98 | for (int idx = 0; idx < blocks.length; idx++) { |
---|
99 | if (blocks[idx].getName().startsWith("blk_") && |
---|
100 | blocks[idx].getName().endsWith(".meta")) { |
---|
101 | num++; |
---|
102 | if (num % 3 == 0) { |
---|
103 | // |
---|
104 | // remove .meta file |
---|
105 | // |
---|
106 | System.out.println("Deliberately removing file " + blocks[idx].getName()); |
---|
107 | assertTrue("Cannot remove file.", blocks[idx].delete()); |
---|
108 | } else if (num % 3 == 1) { |
---|
109 | // |
---|
110 | // shorten .meta file |
---|
111 | // |
---|
112 | RandomAccessFile file = new RandomAccessFile(blocks[idx], "rw"); |
---|
113 | FileChannel channel = file.getChannel(); |
---|
114 | int newsize = random.nextInt((int)channel.size()/2); |
---|
115 | System.out.println("Deliberately truncating file " + |
---|
116 | blocks[idx].getName() + |
---|
117 | " to size " + newsize + " bytes."); |
---|
118 | channel.truncate(newsize); |
---|
119 | file.close(); |
---|
120 | } else { |
---|
121 | // |
---|
122 | // corrupt a few bytes of the metafile |
---|
123 | // |
---|
124 | RandomAccessFile file = new RandomAccessFile(blocks[idx], "rw"); |
---|
125 | FileChannel channel = file.getChannel(); |
---|
126 | long position = 0; |
---|
127 | // |
---|
128 | // The very first time, corrupt the meta header at offset 0 |
---|
129 | // |
---|
130 | if (num != 2) { |
---|
131 | position = (long)random.nextInt((int)channel.size()); |
---|
132 | } |
---|
133 | int length = random.nextInt((int)(channel.size() - position + 1)); |
---|
134 | byte[] buffer = new byte[length]; |
---|
135 | random.nextBytes(buffer); |
---|
136 | channel.write(ByteBuffer.wrap(buffer), position); |
---|
137 | System.out.println("Deliberately corrupting file " + |
---|
138 | blocks[idx].getName() + |
---|
139 | " at offset " + position + |
---|
140 | " length " + length); |
---|
141 | file.close(); |
---|
142 | } |
---|
143 | } |
---|
144 | } |
---|
145 | // |
---|
146 | // Now deliberately corrupt all meta blocks from the second |
---|
147 | // directory of the first datanode |
---|
148 | // |
---|
149 | data_dir = new File(System.getProperty("test.build.data"), |
---|
150 | "dfs/data/data2/current"); |
---|
151 | assertTrue("data directory does not exist", data_dir.exists()); |
---|
152 | blocks = data_dir.listFiles(); |
---|
153 | assertTrue("Blocks do not exist in data-dir", (blocks != null) && (blocks.length > 0)); |
---|
154 | |
---|
155 | int count = 0; |
---|
156 | File previous = null; |
---|
157 | for (int idx = 0; idx < blocks.length; idx++) { |
---|
158 | if (blocks[idx].getName().startsWith("blk_") && |
---|
159 | blocks[idx].getName().endsWith(".meta")) { |
---|
160 | // |
---|
161 | // Move the previous metafile into the current one. |
---|
162 | // |
---|
163 | count++; |
---|
164 | if (count % 2 == 0) { |
---|
165 | System.out.println("Deliberately insertimg bad crc into files " + |
---|
166 | blocks[idx].getName() + " " + previous.getName()); |
---|
167 | assertTrue("Cannot remove file.", blocks[idx].delete()); |
---|
168 | assertTrue("Cannot corrupt meta file.", previous.renameTo(blocks[idx])); |
---|
169 | assertTrue("Cannot recreate empty meta file.", previous.createNewFile()); |
---|
170 | previous = null; |
---|
171 | } else { |
---|
172 | previous = blocks[idx]; |
---|
173 | } |
---|
174 | } |
---|
175 | } |
---|
176 | |
---|
177 | // |
---|
178 | // Only one replica is possibly corrupted. The other replica should still |
---|
179 | // be good. Verify. |
---|
180 | // |
---|
181 | assertTrue("Corrupted replicas not handled properly.", |
---|
182 | util.checkFiles(fs, "/srcdat")); |
---|
183 | System.out.println("All File still have a valid replica"); |
---|
184 | |
---|
185 | // |
---|
186 | // set replication factor back to 1. This causes only one replica of |
---|
187 | // of each block to remain in HDFS. The check is to make sure that |
---|
188 | // the corrupted replica generated above is the one that gets deleted. |
---|
189 | // This test is currently disabled until HADOOP-1557 is solved. |
---|
190 | // |
---|
191 | util.setReplication(fs, "/srcdat", (short)1); |
---|
192 | //util.waitReplication(fs, "/srcdat", (short)1); |
---|
193 | //System.out.println("All Files done with removing replicas"); |
---|
194 | //assertTrue("Excess replicas deleted. Corrupted replicas found.", |
---|
195 | // util.checkFiles(fs, "/srcdat")); |
---|
196 | System.out.println("The excess-corrupted-replica test is disabled " + |
---|
197 | " pending HADOOP-1557"); |
---|
198 | |
---|
199 | util.cleanup(fs, "/srcdat"); |
---|
200 | } finally { |
---|
201 | if (cluster != null) { cluster.shutdown(); } |
---|
202 | } |
---|
203 | } |
---|
204 | |
---|
205 | public void testCrcCorruption() throws Exception { |
---|
206 | // |
---|
207 | // default parameters |
---|
208 | // |
---|
209 | System.out.println("TestCrcCorruption with default parameters"); |
---|
210 | Configuration conf1 = new Configuration(); |
---|
211 | conf1.setInt("dfs.blockreport.intervalMsec", 3 * 1000); |
---|
212 | DFSTestUtil util1 = new DFSTestUtil("TestCrcCorruption", 40, 3, 8*1024); |
---|
213 | thistest(conf1, util1); |
---|
214 | |
---|
215 | // |
---|
216 | // specific parameters |
---|
217 | // |
---|
218 | System.out.println("TestCrcCorruption with specific parameters"); |
---|
219 | Configuration conf2 = new Configuration(); |
---|
220 | conf2.setInt("io.bytes.per.checksum", 17); |
---|
221 | conf2.setInt("dfs.block.size", 34); |
---|
222 | DFSTestUtil util2 = new DFSTestUtil("TestCrcCorruption", 40, 3, 400); |
---|
223 | thistest(conf2, util2); |
---|
224 | } |
---|
225 | } |
---|