source: proiecte/HadoopJUnit/hadoop-0.20.1/src/benchmarks/gridmix/generateData.sh @ 120

Last change on this file since 120 was 120, checked in by (none), 14 years ago

Added the mail files for the Hadoop JUNit Project

  • Property svn:executable set to *
File size: 2.9 KB
Line 
#!/usr/bin/env bash
#
# Launches randomtextwriter jobs that generate the gridmix benchmark input
# data sets. Settings are read from the gridmix-env file located next to
# this script.

# Resolve the absolute directory that contains this script, so gridmix-env is
# found regardless of the caller's working directory. Abort if the directory
# cannot be entered: otherwise `pwd` would silently report the caller's cwd.
GRID_DIR=$(dirname -- "$0")
GRID_DIR=$(cd -- "${GRID_DIR}" && pwd) || {
  echo "${0##*/}: cannot resolve script directory" >&2
  exit 1
}

# NOTE(review): HADOOP_HOME, EXAMPLE_JAR and the output paths used by the jobs
# below are not defined in this file; presumably gridmix-env provides them.
source "${GRID_DIR}/gridmix-env"

# Smaller data set is used by default.
# Total bytes generated for the outputs written with compression enabled.
COMPRESSED_DATA_BYTES=2147483648
# Total bytes generated for the output written without compression.
UNCOMPRESSED_DATA_BYTES=536870912
# Total bytes generated for the output split across INDIRECT_DATA_FILES maps.
INDIRECT_DATA_BYTES=58720256

# Number of partitions for output data. A value already present in the
# environment wins; the default applies when NUM_MAPS is unset or empty.
: "${NUM_MAPS:=100}"

# Divisor used to compute bytes-per-map for the INDIRECT_DATA_BYTES job.
INDIRECT_DATA_FILES=200

# If the env var USE_REAL_DATASET is set (to any non-empty value), use the
# parameters for the bigger (real) dataset: every size is 1000x the default.
if [[ -n "${USE_REAL_DATASET:-}" ]]; then
  echo "Using real dataset"
  # 2TB data compressing to approx 500GB
  COMPRESSED_DATA_BYTES=2147483648000
  # 500GB
  UNCOMPRESSED_DATA_BYTES=536870912000
  # Default approx 70MB per data file, compressed
  INDIRECT_DATA_BYTES=58720256000
fi

# Job 1: variable-size records (keys of 5-10 words, values of 100-10000
# words), output compression enabled, SequenceFile output written to
# ${VARCOMPSEQ}. Runs in the background. All expansions are quoted so paths
# containing whitespace or glob characters are passed through intact.
# NOTE(review): the compression-type key below is the *map* output property;
# confirm whether mapred.output.compression.type was intended.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$((${COMPRESSED_DATA_BYTES} / ${NUM_MAPS}))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=10000 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${VARCOMPSEQ}" &

# Job 2: fixed-size records (keys of exactly 5 words, values of exactly 100
# words), output compression enabled, SequenceFile output written to
# ${FIXCOMPSEQ}. Runs in the background. All expansions are quoted so paths
# containing whitespace or glob characters are passed through intact.
# NOTE(review): the compression-type key below is the *map* output property;
# confirm whether mapred.output.compression.type was intended.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$((${COMPRESSED_DATA_BYTES} / ${NUM_MAPS}))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=100 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${FIXCOMPSEQ}" &

# Job 3: variable-size records (keys of 1-10 words, values of 0-200 words),
# output compression disabled, plain-text output written to ${VARINFLTEXT}.
# Runs in the background. All expansions are quoted so paths containing
# whitespace or glob characters are passed through intact.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${UNCOMPRESSED_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$((${UNCOMPRESSED_DATA_BYTES} / ${NUM_MAPS}))" \
  -D test.randomtextwrite.min_words_key=1 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=0 \
  -D test.randomtextwrite.max_words_value=200 \
  -D mapred.output.compress=false \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  "${VARINFLTEXT}" &

# Job 4: fixed-size records (keys of exactly 5 words, values of exactly 20
# words), output compression enabled, plain-text output written to
# ${FIXCOMPTEXT}. bytes_per_map is derived from INDIRECT_DATA_FILES rather
# than NUM_MAPS. Runs in the background. All expansions are quoted so paths
# containing whitespace or glob characters are passed through intact.
# NOTE(review): the script never calls `wait`, so it returns while the
# background jobs are still running and their exit statuses are not
# collected — confirm this fire-and-forget behaviour is intended.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${INDIRECT_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$((${INDIRECT_DATA_BYTES} / ${INDIRECT_DATA_FILES}))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=20 \
  -D test.randomtextwrite.max_words_value=20 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  "${FIXCOMPTEXT}" &
Note: See TracBrowser for help on using the repository browser.