1 | #!/usr/bin/env bash |
---|
2 | |
---|
3 | ############################################################## |
---|
4 | # Licensed to the Apache Software Foundation (ASF) under one |
---|
5 | # or more contributor license agreements. See the NOTICE file |
---|
6 | # distributed with this work for additional information |
---|
7 | # regarding copyright ownership. The ASF licenses this file |
---|
8 | # to you under the Apache License, Version 2.0 (the |
---|
9 | # "License"); you may not use this file except in compliance |
---|
10 | # with the License. You may obtain a copy of the License at |
---|
11 | # |
---|
12 | # http://www.apache.org/licenses/LICENSE-2.0 |
---|
13 | # |
---|
14 | # Unless required by applicable law or agreed to in writing, software |
---|
15 | # distributed under the License is distributed on an "AS IS" BASIS, |
---|
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
17 | # See the License for the specific language governing permissions and |
---|
18 | # limitations under the License. |
---|
19 | # |
---|
20 | ##################################################################### |
---|
21 | |
---|
# Resolve the absolute directory that contains this script so the companion
# environment file can be found regardless of the caller's working directory.
GRID_DIR=$(dirname -- "$0")
# CDPATH is cleared for this one `cd` so a relative directory name cannot be
# resolved (or echoed) through a CDPATH entry. `&&` prevents `pwd` from
# reporting the caller's directory when the `cd` itself fails.
GRID_DIR=$(CDPATH= cd -- "${GRID_DIR}" && pwd) || {
  printf 'error: cannot resolve the directory of %s\n' "$0" >&2
  exit 1
}
# Load the settings from gridmix-env-2 located next to this script. The path
# is quoted so a directory name containing whitespace stays a single argument.
source "${GRID_DIR}/gridmix-env-2"
---|
25 | |
---|
# Smaller data set is used by default. All sizes are byte counts.
COMPRESSED_DATA_BYTES=2147483648   # 2 GiB
UNCOMPRESSED_DATA_BYTES=536870912  # 512 MiB

# Number of partitions for output data
NUM_MAPS=100

# If the env var USE_REAL_DATASET is set to a non-empty value, switch to the
# parameters that generate the bigger (real) dataset. The expansion is quoted
# inside [[ ]] so a value containing whitespace or glob characters is still
# treated as one string instead of breaking the test.
if [[ -n "${USE_REAL_DATASET:-}" ]]; then
  echo "Using real dataset"
  NUM_MAPS=492
  # 2TB data compressing to approx 500GB
  COMPRESSED_DATA_BYTES=2147483648000
  # 500GB
  UNCOMPRESSED_DATA_BYTES=536870912000
fi
---|
42 | |
---|
## Data sources
# Root location; the four data set paths below are all derived from it.
GRID_MIX_DATA="/gridmix/data"
# Variable length key, value compressed SequenceFile
VARCOMPSEQ="${GRID_MIX_DATA}/WebSimulationBlockCompressed"
# Fixed length key, value compressed SequenceFile
FIXCOMPSEQ="${GRID_MIX_DATA}/MonsterQueryBlockCompressed"
# Variable length key, value uncompressed Text File
VARINFLTEXT="${GRID_MIX_DATA}/SortUncompressed"
# Fixed length key, value compressed Text File
FIXCOMPTEXT="${GRID_MIX_DATA}/EntropySimulationCompressed"
# Publish all five names to the environment of child processes.
export GRID_MIX_DATA VARCOMPSEQ FIXCOMPSEQ VARINFLTEXT FIXCOMPTEXT
---|
53 | |
---|
# Generate the VARCOMPSEQ data set by running the `randomtextwriter` program
# from EXAMPLE_JAR:
#   - total_bytes is COMPRESSED_DATA_BYTES; bytes_per_map is that total divided
#     by NUM_MAPS (integer division)
#   - keys of 5 to 10 words, values of 100 to 10000 words
#   - mapred.output.compress=true, written with SequenceFileOutputFormat
# The job is started in the background (trailing `&`); its exit status is not
# checked by this statement.
# HADOOP_HOME and EXAMPLE_JAR must already be set when this runs (presumably by
# the sourced gridmix-env-2 -- confirm). Every expansion is quoted so a path
# containing whitespace or glob characters is passed as a single argument.
# NOTE(review): the compression-type key used here is
# `mapred.map.output.compression.type`; confirm that this, rather than
# `mapred.output.compression.type`, is the key the output format reads.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$(( COMPRESSED_DATA_BYTES / NUM_MAPS ))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=10000 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${VARCOMPSEQ}" &
---|
66 | |
---|
67 | |
---|
# Generate the FIXCOMPSEQ data set by running the `randomtextwriter` program
# from EXAMPLE_JAR:
#   - total_bytes is COMPRESSED_DATA_BYTES; bytes_per_map is that total divided
#     by NUM_MAPS (integer division)
#   - fixed sizes: min and max are equal, 5 words per key and 100 per value
#   - mapred.output.compress=true, written with SequenceFileOutputFormat
# The job is started in the background (trailing `&`); its exit status is not
# checked by this statement.
# HADOOP_HOME and EXAMPLE_JAR must already be set when this runs (presumably by
# the sourced gridmix-env-2 -- confirm). Every expansion is quoted so a path
# containing whitespace or glob characters is passed as a single argument.
# NOTE(review): the compression-type key used here is
# `mapred.map.output.compression.type`; confirm that this, rather than
# `mapred.output.compression.type`, is the key the output format reads.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$(( COMPRESSED_DATA_BYTES / NUM_MAPS ))" \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=100 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  "${FIXCOMPSEQ}" &
---|
80 | |
---|
81 | |
---|
# Generate the VARINFLTEXT data set by running the `randomtextwriter` program
# from EXAMPLE_JAR:
#   - total_bytes is UNCOMPRESSED_DATA_BYTES; bytes_per_map is that total
#     divided by NUM_MAPS (integer division)
#   - keys of 1 to 10 words, values of 0 to 200 words
#   - mapred.output.compress=false, written with TextOutputFormat
# The job is started in the background (trailing `&`); its exit status is not
# checked by this statement.
# HADOOP_HOME and EXAMPLE_JAR must already be set when this runs (presumably by
# the sourced gridmix-env-2 -- confirm). Every expansion is quoted so a path
# containing whitespace or glob characters is passed as a single argument.
"${HADOOP_HOME}/bin/hadoop" jar \
  "${EXAMPLE_JAR}" randomtextwriter \
  -D "test.randomtextwrite.total_bytes=${UNCOMPRESSED_DATA_BYTES}" \
  -D "test.randomtextwrite.bytes_per_map=$(( UNCOMPRESSED_DATA_BYTES / NUM_MAPS ))" \
  -D test.randomtextwrite.min_words_key=1 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=0 \
  -D test.randomtextwrite.max_words_value=200 \
  -D mapred.output.compress=false \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  "${VARINFLTEXT}" &
---|
93 | |
---|
94 | |
---|