#!/usr/bin/env bash

# Resolve the absolute directory containing this script, then load the
# shared gridmix environment. gridmix-env is expected to define the
# variables used below (HADOOP_HOME, EXAMPLE_JAR, VARCOMPSEQ, FIXCOMPSEQ,
# VARINFLTEXT, FIXCOMPTEXT) — presumably; confirm against gridmix-env.
#
# $(…) replaces the legacy backticks; the paths are quoted so a script
# location containing spaces still works. `cd … && pwd` (instead of
# `cd …; pwd`) avoids silently falling back to the caller's cwd when the
# cd fails.
GRID_DIR=$(dirname "$0")
GRID_DIR=$(cd "$GRID_DIR" && pwd)
source "$GRID_DIR/gridmix-env"
6 | |
---|
# Smaller data set is used by default.
COMPRESSED_DATA_BYTES=2147483648
UNCOMPRESSED_DATA_BYTES=536870912
INDIRECT_DATA_BYTES=58720256

# Number of partitions for output data. Honor a caller-supplied NUM_MAPS;
# default to 100 when it is unset or empty. This parameter expansion is
# the quoting-safe equivalent of the original `[ -z ${NUM_MAPS} ]` test,
# which only worked by accident when NUM_MAPS was unset (one-argument
# `test` form) and would error on a value containing spaces.
NUM_MAPS=${NUM_MAPS:-100}

# Number of files the indirect data set is split into.
INDIRECT_DATA_FILES=200
18 | |
---|
# If the env var USE_REAL_DATASET is set, then use the params to generate
# the bigger (real) dataset. `[ -n "…" ]` replaces the original unquoted
# `[ ! -z ${USE_REAL_DATASET} ]`, which would error out if the variable
# ever held a value containing whitespace.
if [ -n "${USE_REAL_DATASET}" ] ; then
  echo "Using real dataset"
  # 2TB data compressing to approx 500GB
  COMPRESSED_DATA_BYTES=2147483648000
  # 500GB
  UNCOMPRESSED_DATA_BYTES=536870912000
  # Default approx 70MB per data file, compressed
  INDIRECT_DATA_BYTES=58720256000
fi
29 | |
---|
# Job 1: variable-length keys (5-10 words) and values (100-10000 words),
# COMPRESSED_DATA_BYTES total split across NUM_MAPS maps, written as a
# compressed SequenceFile under ${VARCOMPSEQ}. Launched in the background
# so the four generator jobs run concurrently.
# NOTE(review): this sets mapred.map.output.compression.type while
# enabling *job* output compression — possibly mapred.output.compression.type
# was intended; confirm against the gridmix job configs.
# All expansions are quoted so paths with spaces don't word-split.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${COMPRESSED_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map=$((COMPRESSED_DATA_BYTES / NUM_MAPS)) \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=10000 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${VARCOMPSEQ}" &
42 | |
---|
# Job 2: fixed-length keys (exactly 5 words) and values (exactly 100
# words), COMPRESSED_DATA_BYTES total split across NUM_MAPS maps, written
# as a compressed SequenceFile under ${FIXCOMPSEQ}. Runs in the background
# alongside the other generators.
# All expansions are quoted so paths with spaces don't word-split.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${COMPRESSED_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map=$((COMPRESSED_DATA_BYTES / NUM_MAPS)) \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=100 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${FIXCOMPSEQ}" &
55 | |
---|
# Job 3: variable-length keys (1-10 words) and values (0-200 words),
# UNCOMPRESSED_DATA_BYTES total split across NUM_MAPS maps, written as
# plain (uncompressed) text under ${VARINFLTEXT}. Runs in the background
# alongside the other generators.
# All expansions are quoted so paths with spaces don't word-split.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${UNCOMPRESSED_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map=$((UNCOMPRESSED_DATA_BYTES / NUM_MAPS)) \
  -D test.randomtextwrite.min_words_key=1 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=0 \
  -D test.randomtextwrite.max_words_value=200 \
  -D mapred.output.compress=false \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  "${VARINFLTEXT}" &
67 | |
---|
# Job 4: the "indirect" data set — fixed 5-word keys and 20-word values,
# INDIRECT_DATA_BYTES total split across INDIRECT_DATA_FILES maps (one
# output file each, ~70MB compressed by default), written as compressed
# text under ${FIXCOMPTEXT}. Runs in the background alongside the other
# generators.
# All expansions are quoted so paths with spaces don't word-split.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D test.randomtextwrite.total_bytes="${INDIRECT_DATA_BYTES}" \
  -D test.randomtextwrite.bytes_per_map=$((INDIRECT_DATA_BYTES / INDIRECT_DATA_FILES)) \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=20 \
  -D test.randomtextwrite.max_words_value=20 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  "${FIXCOMPTEXT}" &