Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

generateGridmix2data.sh @ 120

Last change on this file since 120 was 120, checked in by (none), 14 years ago
Added the mail files for the Hadoop JUNit Project
Property svn:executable set to *
File size: 3.6 KB

Line
1	#!/usr/bin/env bash
2
3	##############################################################
4	# Licensed to the Apache Software Foundation (ASF) under one
5	# or more contributor license agreements. See the NOTICE file
6	# distributed with this work for additional information
7	# regarding copyright ownership. The ASF licenses this file
8	# to you under the Apache License, Version 2.0 (the
9	# "License"); you may not use this file except in compliance
10	# with the License. You may obtain a copy of the License at
11	#
12	# http://www.apache.org/licenses/LICENSE-2.0
13	#
14	# Unless required by applicable law or agreed to in writing, software
15	# distributed under the License is distributed on an "AS IS" BASIS,
16	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17	# See the License for the specific language governing permissions and
18	# limitations under the License.
19	#
20	#####################################################################
21
# Resolve the absolute path of the directory that contains this script, then
# load the settings file gridmix-env-2 that lives in the same directory.
GRID_DIR=$(dirname "$0")
# Abort instead of continuing when the directory cannot be entered; without
# the check, pwd would silently report the caller's working directory.
GRID_DIR=$(cd "$GRID_DIR" && pwd) || {
  echo "Cannot determine the gridmix2 script directory" >&2
  exit 1
}
# Quoted so that an install path containing whitespace is sourced as one file.
source "$GRID_DIR/gridmix-env-2"
25
# Data set sizing.
#
# Smaller data set is used by default: 2147483648 bytes (2 GiB) for
# COMPRESSED_DATA_BYTES and 536870912 bytes (512 MiB) for
# UNCOMPRESSED_DATA_BYTES.
COMPRESSED_DATA_BYTES=2147483648
UNCOMPRESSED_DATA_BYTES=536870912

# Number of partitions for output data
NUM_MAPS=100

# If the env var USE_REAL_DATASET is set to a non-empty value, use the params
# that generate the bigger (real) dataset. The expansion is quoted and given
# an empty default so that a value containing whitespace cannot break the
# test and silently fall back to the small data set.
if [[ -n "${USE_REAL_DATASET:-}" ]]; then
  echo "Using real dataset"
  NUM_MAPS=492
  # 2TB data compressing to approx 500GB
  COMPRESSED_DATA_BYTES=2147483648000
  # 500GB
  UNCOMPRESSED_DATA_BYTES=536870912000
fi
42
## Data sources
# Base path under which every data set path below is built.
GRID_MIX_DATA=/gridmix/data

# Variable length key, value compressed SequenceFile
VARCOMPSEQ=${GRID_MIX_DATA}/WebSimulationBlockCompressed
# Fixed length key, value compressed SequenceFile
FIXCOMPSEQ=${GRID_MIX_DATA}/MonsterQueryBlockCompressed
# Variable length key, value uncompressed Text File
VARINFLTEXT=${GRID_MIX_DATA}/SortUncompressed
# Fixed length key, value compressed Text File
FIXCOMPTEXT=${GRID_MIX_DATA}/EntropySimulationCompressed

# Publish all five paths to the environment of child processes.
export GRID_MIX_DATA VARCOMPSEQ FIXCOMPSEQ VARINFLTEXT FIXCOMPTEXT
53
# Generate the variable-length key/value data set at ${VARCOMPSEQ} by running
# the randomtextwriter program from ${EXAMPLE_JAR}:
#   - COMPRESSED_DATA_BYTES bytes in total, COMPRESSED_DATA_BYTES / NUM_MAPS
#     (integer division) bytes per map
#   - 5 to 10 words per key, 100 to 10000 words per value
#   - output compression enabled, written with SequenceFileOutputFormat
# HADOOP_HOME and EXAMPLE_JAR are not set in this script; presumably they come
# from the sourced gridmix-env-2 file -- TODO confirm.
# Path expansions are quoted so a value containing whitespace or glob
# characters is passed as a single argument.
# NOTE(review): the target is named "BlockCompressed", yet the key set below
# is mapred.map.output.compression.type; confirm whether
# mapred.output.compression.type was intended.
# The trailing '&' starts the job in the background.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${COMPRESSED_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map="$((COMPRESSED_DATA_BYTES / NUM_MAPS))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=10000 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${VARCOMPSEQ}" &
66
67
# Generate the fixed-length key/value data set at ${FIXCOMPSEQ} by running the
# randomtextwriter program from ${EXAMPLE_JAR}:
#   - COMPRESSED_DATA_BYTES bytes in total, COMPRESSED_DATA_BYTES / NUM_MAPS
#     (integer division) bytes per map
#   - exactly 5 words per key and 100 words per value (min equals max)
#   - output compression enabled, written with SequenceFileOutputFormat
# HADOOP_HOME and EXAMPLE_JAR are not set in this script; presumably they come
# from the sourced gridmix-env-2 file -- TODO confirm.
# Path expansions are quoted so a value containing whitespace or glob
# characters is passed as a single argument.
# NOTE(review): the target is named "BlockCompressed", yet the key set below
# is mapred.map.output.compression.type; confirm whether
# mapred.output.compression.type was intended.
# The trailing '&' starts the job in the background.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${COMPRESSED_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map="$((COMPRESSED_DATA_BYTES / NUM_MAPS))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=100 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${FIXCOMPSEQ}" &
80
81
# Generate the variable-length uncompressed text data set at ${VARINFLTEXT} by
# running the randomtextwriter program from ${EXAMPLE_JAR}:
#   - UNCOMPRESSED_DATA_BYTES bytes in total, UNCOMPRESSED_DATA_BYTES / NUM_MAPS
#     (integer division) bytes per map
#   - 1 to 10 words per key, 0 to 200 words per value
#   - output compression disabled, written with TextOutputFormat
# HADOOP_HOME and EXAMPLE_JAR are not set in this script; presumably they come
# from the sourced gridmix-env-2 file -- TODO confirm.
# Path expansions are quoted so a value containing whitespace or glob
# characters is passed as a single argument.
# The trailing '&' starts the job in the background.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${UNCOMPRESSED_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map="$((UNCOMPRESSED_DATA_BYTES / NUM_MAPS))" \
  -D test.randomtextwrite.min_words_key=1 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=0 \
  -D test.randomtextwrite.max_words_value=200 \
  -D mapred.output.compress=false \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  "${VARINFLTEXT}" &
93
94

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/HadoopJUnit/hadoop-0.20.1/src/benchmarks/gridmix2/generateGridmix2data.sh @ 120

Download in other formats: