Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

ThreadedMapBenchmark.java @ 120

Last change on this file since 120 was 120, checked in by (none), 14 years ago
Added the mail files for the Hadoop JUNit Project
Property svn:executable set to ``*
File size: 11.2 KB

Line
1	/**
2	* Licensed to the Apache Software Foundation (ASF) under one
3	* or more contributor license agreements. See the NOTICE file
4	* distributed with this work for additional information
5	* regarding copyright ownership. The ASF licenses this file
6	* to you under the Apache License, Version 2.0 (the
7	* "License"); you may not use this file except in compliance
8	* with the License. You may obtain a copy of the License at
9	*
10	* http://www.apache.org/licenses/LICENSE-2.0
11	*
12	* Unless required by applicable law or agreed to in writing, software
13	* distributed under the License is distributed on an "AS IS" BASIS,
14	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	* See the License for the specific language governing permissions and
16	* limitations under the License.
17	*/
18
19	package org.apache.hadoop.mapred;
20
21	import java.io.IOException;
22	import java.io.File;
23	import java.util.Random;
24
25	import org.apache.commons.logging.Log;
26	import org.apache.commons.logging.LogFactory;
27	import org.apache.hadoop.conf.Configured;
28	import org.apache.hadoop.examples.RandomWriter;
29	import org.apache.hadoop.fs.FileSystem;
30	import org.apache.hadoop.fs.Path;
31	import org.apache.hadoop.io.BytesWritable;
32	import org.apache.hadoop.io.Text;
33	import org.apache.hadoop.io.Writable;
34	import org.apache.hadoop.io.WritableComparable;
35	import org.apache.hadoop.mapred.SortValidator.RecordStatsChecker.NonSplitableSequenceFileInputFormat;
36	import org.apache.hadoop.mapred.lib.IdentityMapper;
37	import org.apache.hadoop.mapred.lib.IdentityReducer;
38	import org.apache.hadoop.util.Tool;
39	import org.apache.hadoop.util.ToolRunner;
40
41	/**
42	* Distributed threaded map benchmark.
43	* <p>
44	* This benchmark generates random data per map and tests the performance
45	* of having multiple spills (using multiple threads) over having just one
46	* spill. Following are the parameters that can be specified
47	* <li>File size per map.
48	* <li>Number of spills per map.
49	* <li>Number of maps per host.
50	* <p>
51	* Sort is used for benchmarking the performance.
52	*/
53
54	public class ThreadedMapBenchmark extends Configured implements Tool {
55
56	private static final Log LOG = LogFactory.getLog(ThreadedMapBenchmark.class);
57	private static Path BASE_DIR =
58	new Path(System.getProperty("test.build.data",
59	File.separator + "benchmarks" + File.separator
60	+ "ThreadedMapBenchmark"));
61	private static Path INPUT_DIR = new Path(BASE_DIR, "input");
62	private static Path OUTPUT_DIR = new Path(BASE_DIR, "output");
63	private static final float FACTOR = 2.3f; // io.sort.mb set to
64	// (FACTOR * data_size) should
65	// result in only 1 spill
66
67	static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
68
69	/**
70	* Generates random input data of given size with keys and values of given
71	* sizes. By default it generates 128mb input data with 10 byte keys and 10
72	* byte values.
73	*/
74	public static class Map extends MapReduceBase
75	implements Mapper<WritableComparable, Writable,
76	BytesWritable, BytesWritable> {
77
78	private long numBytesToWrite;
79	private int minKeySize;
80	private int keySizeRange;
81	private int minValueSize;
82	private int valueSizeRange;
83	private Random random = new Random();
84	private BytesWritable randomKey = new BytesWritable();
85	private BytesWritable randomValue = new BytesWritable();
86
87	private void randomizeBytes(byte[] data, int offset, int length) {
88	for(int i = offset + length - 1; i >= offset; --i) {
89	data[i] = (byte) random.nextInt(256);
90	}
91	}
92
93	public void map(WritableComparable key,
94	Writable value,
95	OutputCollector<BytesWritable, BytesWritable> output,
96	Reporter reporter) throws IOException {
97	int itemCount = 0;
98	while (numBytesToWrite > 0) {
99	int keyLength = minKeySize
100	+ (keySizeRange != 0
101	? random.nextInt(keySizeRange)
102	: 0);
103	randomKey.setSize(keyLength);
104	randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
105	int valueLength = minValueSize
106	+ (valueSizeRange != 0
107	? random.nextInt(valueSizeRange)
108	: 0);
109	randomValue.setSize(valueLength);
110	randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
111	output.collect(randomKey, randomValue);
112	numBytesToWrite -= keyLength + valueLength;
113	reporter.incrCounter(Counters.BYTES_WRITTEN, 1);
114	reporter.incrCounter(Counters.RECORDS_WRITTEN, 1);
115	if (++itemCount % 200 == 0) {
116	reporter.setStatus("wrote record " + itemCount + ". "
117	+ numBytesToWrite + " bytes left.");
118	}
119	}
120	reporter.setStatus("done with " + itemCount + " records.");
121	}
122
123	@Override
124	public void configure(JobConf job) {
125	numBytesToWrite = job.getLong("test.tmb.bytes_per_map",
126	128 * 1024 * 1024);
127	minKeySize = job.getInt("test.tmb.min_key", 10);
128	keySizeRange = job.getInt("test.tmb.max_key", 10) - minKeySize;
129	minValueSize = job.getInt("test.tmb.min_value", 10);
130	valueSizeRange = job.getInt("test.tmb.max_value", 10) - minValueSize;
131	}
132	}
133
134	/**
135	* Generate input data for the benchmark
136	*/
137	public static void generateInputData(int dataSizePerMap,
138	int numSpillsPerMap,
139	int numMapsPerHost,
140	JobConf masterConf)
141	throws Exception {
142	JobConf job = new JobConf(masterConf, ThreadedMapBenchmark.class);
143	job.setJobName("threaded-map-benchmark-random-writer");
144	job.setJarByClass(ThreadedMapBenchmark.class);
145	job.setInputFormat(UtilsForTests.RandomInputFormat.class);
146	job.setOutputFormat(SequenceFileOutputFormat.class);
147
148	job.setMapperClass(Map.class);
149	job.setReducerClass(IdentityReducer.class);
150
151	job.setOutputKeyClass(BytesWritable.class);
152	job.setOutputValueClass(BytesWritable.class);
153
154	JobClient client = new JobClient(job);
155	ClusterStatus cluster = client.getClusterStatus();
156	long totalDataSize = dataSizePerMap * numMapsPerHost
157	* cluster.getTaskTrackers();
158	job.set("test.tmb.bytes_per_map",
159	String.valueOf(dataSizePerMap * 1024 * 1024));
160	job.setNumReduceTasks(0); // none reduce
161	job.setNumMapTasks(numMapsPerHost * cluster.getTaskTrackers());
162	FileOutputFormat.setOutputPath(job, INPUT_DIR);
163
164	FileSystem fs = FileSystem.get(job);
165	fs.delete(BASE_DIR, true);
166
167	LOG.info("Generating random input for the benchmark");
168	LOG.info("Total data : " + totalDataSize + " mb");
169	LOG.info("Data per map: " + dataSizePerMap + " mb");
170	LOG.info("Number of spills : " + numSpillsPerMap);
171	LOG.info("Number of maps per host : " + numMapsPerHost);
172	LOG.info("Number of hosts : " + cluster.getTaskTrackers());
173
174	JobClient.runJob(job); // generates the input for the benchmark
175	}
176
177	/**
178	* This is the main routine for launching the benchmark. It generates random
179	* input data. The input is non-splittable. Sort is used for benchmarking.
180	* This benchmark reports the effect of having multiple sort and spill
181	* cycles over a single sort and spill.
182	*
183	* @throws IOException
184	*/
185	public int run (String[] args) throws Exception {
186	LOG.info("Starting the benchmark for threaded spills");
187	String version = "ThreadedMapBenchmark.0.0.1";
188	System.out.println(version);
189
190	String usage =
191	"Usage: threadedmapbenchmark " +
192	"[-dataSizePerMap <data size (in mb) per map, default is 128 mb>] " +
193	"[-numSpillsPerMap <number of spills per map, default is 2>] " +
194	"[-numMapsPerHost <number of maps per host, default is 1>]";
195
196	int dataSizePerMap = 128; // in mb
197	int numSpillsPerMap = 2;
198	int numMapsPerHost = 1;
199	JobConf masterConf = new JobConf(getConf());
200
201	for (int i = 0; i < args.length; i++) { // parse command line
202	if (args[i].equals("-dataSizePerMap")) {
203	dataSizePerMap = Integer.parseInt(args[++i]);
204	} else if (args[i].equals("-numSpillsPerMap")) {
205	numSpillsPerMap = Integer.parseInt(args[++i]);
206	} else if (args[i].equals("-numMapsPerHost")) {
207	numMapsPerHost = Integer.parseInt(args[++i]);
208	} else {
209	System.err.println(usage);
210	System.exit(-1);
211	}
212	}
213
214	if (dataSizePerMap < 1 \|\| // verify arguments
215	numSpillsPerMap < 1 \|\|
216	numMapsPerHost < 1)
217	{
218	System.err.println(usage);
219	System.exit(-1);
220	}
221
222	FileSystem fs = null;
223	try {
224	// using random-writer to generate the input data
225	generateInputData(dataSizePerMap, numSpillsPerMap, numMapsPerHost,
226	masterConf);
227
228	// configure job for sorting
229	JobConf job = new JobConf(masterConf, ThreadedMapBenchmark.class);
230	job.setJobName("threaded-map-benchmark-unspilled");
231	job.setJarByClass(ThreadedMapBenchmark.class);
232
233	job.setInputFormat(NonSplitableSequenceFileInputFormat.class);
234	job.setOutputFormat(SequenceFileOutputFormat.class);
235
236	job.setOutputKeyClass(BytesWritable.class);
237	job.setOutputValueClass(BytesWritable.class);
238
239	job.setMapperClass(IdentityMapper.class);
240	job.setReducerClass(IdentityReducer.class);
241
242	FileInputFormat.addInputPath(job, INPUT_DIR);
243	FileOutputFormat.setOutputPath(job, OUTPUT_DIR);
244
245	JobClient client = new JobClient(job);
246	ClusterStatus cluster = client.getClusterStatus();
247	job.setNumMapTasks(numMapsPerHost * cluster.getTaskTrackers());
248	job.setNumReduceTasks(1);
249
250	// set io.sort.mb to avoid spill
251	int ioSortMb = (int)Math.ceil(FACTOR * dataSizePerMap);
252	job.set("io.sort.mb", String.valueOf(ioSortMb));
253	fs = FileSystem.get(job);
254
255	LOG.info("Running sort with 1 spill per map");
256	long startTime = System.currentTimeMillis();
257	JobClient.runJob(job);
258	long endTime = System.currentTimeMillis();
259
260	LOG.info("Total time taken : " + String.valueOf(endTime - startTime)
261	+ " millisec");
262	fs.delete(OUTPUT_DIR, true);
263
264	// set io.sort.mb to have multiple spills
265	JobConf spilledJob = new JobConf(job, ThreadedMapBenchmark.class);
266	ioSortMb = (int)Math.ceil(FACTOR
267	* Math.ceil((double)dataSizePerMap
268	/ numSpillsPerMap));
269	spilledJob.set("io.sort.mb", String.valueOf(ioSortMb));
270	spilledJob.setJobName("threaded-map-benchmark-spilled");
271	spilledJob.setJarByClass(ThreadedMapBenchmark.class);
272
273	LOG.info("Running sort with " + numSpillsPerMap + " spills per map");
274	startTime = System.currentTimeMillis();
275	JobClient.runJob(spilledJob);
276	endTime = System.currentTimeMillis();
277
278	LOG.info("Total time taken : " + String.valueOf(endTime - startTime)
279	+ " millisec");
280	} finally {
281	if (fs != null) {
282	fs.delete(BASE_DIR, true);
283	}
284	}
285	return 0;
286	}
287
288	public static void main(String[] args) throws Exception {
289	int res = ToolRunner.run(new ThreadedMapBenchmark(), args);
290	System.exit(res);
291	}
292	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/HadoopJUnit/hadoop-0.20.1/src/test/org/apache/hadoop/mapred/ThreadedMapBenchmark.java @ 120

Download in other formats: