1 | /** |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one |
---|
3 | * or more contributor license agreements. See the NOTICE file |
---|
4 | * distributed with this work for additional information |
---|
5 | * regarding copyright ownership. The ASF licenses this file |
---|
6 | * to you under the Apache License, Version 2.0 (the |
---|
7 | * "License"); you may not use this file except in compliance |
---|
8 | * with the License. You may obtain a copy of the License at |
---|
9 | * |
---|
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | * |
---|
12 | * Unless required by applicable law or agreed to in writing, software |
---|
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | * See the License for the specific language governing permissions and |
---|
16 | * limitations under the License. |
---|
17 | */ |
---|
18 | package org.apache.hadoop.mapred; |
---|
19 | |
---|
20 | import java.io.IOException; |
---|
21 | import java.util.BitSet; |
---|
22 | import java.util.HashMap; |
---|
23 | import java.util.Random; |
---|
24 | |
---|
25 | import junit.framework.TestCase; |
---|
26 | |
---|
27 | import org.apache.commons.logging.Log; |
---|
28 | import org.apache.commons.logging.LogFactory; |
---|
29 | import org.apache.hadoop.fs.FSDataOutputStream; |
---|
30 | import org.apache.hadoop.fs.FileSystem; |
---|
31 | import org.apache.hadoop.fs.Path; |
---|
32 | import org.apache.hadoop.io.Text; |
---|
33 | |
---|
34 | public class TestMultiFileInputFormat extends TestCase{ |
---|
35 | |
---|
36 | private static JobConf job = new JobConf(); |
---|
37 | |
---|
38 | private static final Log LOG = LogFactory.getLog(TestMultiFileInputFormat.class); |
---|
39 | |
---|
40 | private static final int MAX_SPLIT_COUNT = 10000; |
---|
41 | private static final int SPLIT_COUNT_INCR = 6000; |
---|
42 | private static final int MAX_BYTES = 1024; |
---|
43 | private static final int MAX_NUM_FILES = 10000; |
---|
44 | private static final int NUM_FILES_INCR = 8000; |
---|
45 | |
---|
46 | private Random rand = new Random(System.currentTimeMillis()); |
---|
47 | private HashMap<String, Long> lengths = new HashMap<String, Long>(); |
---|
48 | |
---|
49 | /** Dummy class to extend MultiFileInputFormat*/ |
---|
50 | private class DummyMultiFileInputFormat extends MultiFileInputFormat<Text, Text> { |
---|
51 | @Override |
---|
52 | public RecordReader<Text,Text> getRecordReader(InputSplit split, JobConf job |
---|
53 | , Reporter reporter) throws IOException { |
---|
54 | return null; |
---|
55 | } |
---|
56 | } |
---|
57 | |
---|
58 | private Path initFiles(FileSystem fs, int numFiles, int numBytes) throws IOException{ |
---|
59 | Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred"); |
---|
60 | Path multiFileDir = new Path(dir, "test.multifile"); |
---|
61 | fs.delete(multiFileDir, true); |
---|
62 | fs.mkdirs(multiFileDir); |
---|
63 | LOG.info("Creating " + numFiles + " file(s) in " + multiFileDir); |
---|
64 | for(int i=0; i<numFiles ;i++) { |
---|
65 | Path path = new Path(multiFileDir, "file_" + i); |
---|
66 | FSDataOutputStream out = fs.create(path); |
---|
67 | if (numBytes == -1) { |
---|
68 | numBytes = rand.nextInt(MAX_BYTES); |
---|
69 | } |
---|
70 | for(int j=0; j< numBytes; j++) { |
---|
71 | out.write(rand.nextInt()); |
---|
72 | } |
---|
73 | out.close(); |
---|
74 | if(LOG.isDebugEnabled()) { |
---|
75 | LOG.debug("Created file " + path + " with length " + numBytes); |
---|
76 | } |
---|
77 | lengths.put(path.getName(), new Long(numBytes)); |
---|
78 | } |
---|
79 | FileInputFormat.setInputPaths(job, multiFileDir); |
---|
80 | return multiFileDir; |
---|
81 | } |
---|
82 | |
---|
83 | public void testFormat() throws IOException { |
---|
84 | if(LOG.isInfoEnabled()) { |
---|
85 | LOG.info("Test started"); |
---|
86 | LOG.info("Max split count = " + MAX_SPLIT_COUNT); |
---|
87 | LOG.info("Split count increment = " + SPLIT_COUNT_INCR); |
---|
88 | LOG.info("Max bytes per file = " + MAX_BYTES); |
---|
89 | LOG.info("Max number of files = " + MAX_NUM_FILES); |
---|
90 | LOG.info("Number of files increment = " + NUM_FILES_INCR); |
---|
91 | } |
---|
92 | |
---|
93 | MultiFileInputFormat<Text,Text> format = new DummyMultiFileInputFormat(); |
---|
94 | FileSystem fs = FileSystem.getLocal(job); |
---|
95 | |
---|
96 | for(int numFiles = 1; numFiles< MAX_NUM_FILES ; |
---|
97 | numFiles+= (NUM_FILES_INCR / 2) + rand.nextInt(NUM_FILES_INCR / 2)) { |
---|
98 | |
---|
99 | Path dir = initFiles(fs, numFiles, -1); |
---|
100 | BitSet bits = new BitSet(numFiles); |
---|
101 | for(int i=1;i< MAX_SPLIT_COUNT ;i+= rand.nextInt(SPLIT_COUNT_INCR) + 1) { |
---|
102 | LOG.info("Running for Num Files=" + numFiles + ", split count=" + i); |
---|
103 | |
---|
104 | MultiFileSplit[] splits = (MultiFileSplit[])format.getSplits(job, i); |
---|
105 | bits.clear(); |
---|
106 | |
---|
107 | for(MultiFileSplit split : splits) { |
---|
108 | long splitLength = 0; |
---|
109 | for(Path p : split.getPaths()) { |
---|
110 | long length = fs.getContentSummary(p).getLength(); |
---|
111 | assertEquals(length, lengths.get(p.getName()).longValue()); |
---|
112 | splitLength += length; |
---|
113 | String name = p.getName(); |
---|
114 | int index = Integer.parseInt( |
---|
115 | name.substring(name.lastIndexOf("file_") + 5)); |
---|
116 | assertFalse(bits.get(index)); |
---|
117 | bits.set(index); |
---|
118 | } |
---|
119 | assertEquals(splitLength, split.getLength()); |
---|
120 | } |
---|
121 | } |
---|
122 | assertEquals(bits.cardinality(), numFiles); |
---|
123 | fs.delete(dir, true); |
---|
124 | } |
---|
125 | LOG.info("Test Finished"); |
---|
126 | } |
---|
127 | |
---|
128 | public void testFormatWithLessPathsThanSplits() throws Exception { |
---|
129 | MultiFileInputFormat<Text,Text> format = new DummyMultiFileInputFormat(); |
---|
130 | FileSystem fs = FileSystem.getLocal(job); |
---|
131 | |
---|
132 | // Test with no path |
---|
133 | initFiles(fs, 0, -1); |
---|
134 | assertEquals(0, format.getSplits(job, 2).length); |
---|
135 | |
---|
136 | // Test with 2 path and 4 splits |
---|
137 | initFiles(fs, 2, 500); |
---|
138 | assertEquals(2, format.getSplits(job, 4).length); |
---|
139 | } |
---|
140 | |
---|
141 | public static void main(String[] args) throws Exception{ |
---|
142 | TestMultiFileInputFormat test = new TestMultiFileInputFormat(); |
---|
143 | test.testFormat(); |
---|
144 | } |
---|
145 | } |
---|