/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapreduce; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.mapred.RawKeyValueIterator; /** * Reduces a set of intermediate values which share a key to a smaller set of * values. * *

Reducer implementations * can access the {@link Configuration} for the job via the * {@link JobContext#getConfiguration()} method.

Reducer has 3 primary phases:

* *
Shuffle
* *
The Reducer copies the sorted output from each * {@link Mapper} using HTTP across the network.
*
*
Sort
* *
The framework merge sorts Reducer inputs by * keys * (since different Mappers may have output the same key).
* *
The shuffle and sort phases occur simultaneously i.e. while outputs are * being fetched they are merged.
* *
SecondarySort
* *
To achieve a secondary sort on the values returned by the value * iterator, the application should extend the key with the secondary * key and define a grouping comparator. The keys will be sorted using the * entire key, but will be grouped using the grouping comparator to decide * which keys and values are sent in the same call to reduce. The grouping * comparator is specified via * {@link Job#setGroupingComparatorClass(Class)}. The sort order is * controlled by * {@link Job#setSortComparatorClass(Class)}.
* * * For example, say that you want to find duplicate web pages and tag them * all with the url of the "best" known example. You would set up the job * like: *
- Map Input Key: url
- Map Input Value: document
- Map Output Key: document checksum, url pagerank
- Map Output Value: url
- Partitioner: by checksum
- OutputKeyComparator: by checksum and then decreasing pagerank
- OutputValueGroupingComparator: by checksum
*
*
Reduce
* *
In this phase the * {@link #reduce(Object, Iterable, Context)} * method is called for each &lt;key, (collection of values)&gt; in * the sorted inputs.
*
The output of the reduce task is typically written to a * {@link RecordWriter} via * {@link Context#write(Object, Object)}.
*

* *

The output of the Reducer is not re-sorted.

* *

Example:

 * public class IntSumReducer&lt;Key&gt; extends Reducer&lt;Key,IntWritable,Key,IntWritable&gt; {
 *   private IntWritable result = new IntWritable();
 * 
 *   public void reduce(Key key, Iterable&lt;IntWritable&gt; values,
 *                      Context context) throws IOException, InterruptedException {
 *     int sum = 0;
 *     for (IntWritable val : values) {
 *       sum += val.get();
 *     }
 *     result.set(sum);
 *     context.write(key, result);
 *   }
 * }
 *

* * @see Mapper * @see Partitioner */
public class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  /**
   * The context object handed to {@link #setup(Context)},
   * {@link #reduce(Object, Iterable, Context)}, {@link #cleanup(Context)}
   * and {@link #run(Context)}.
   *
   * <p>It adds no state or behaviour of its own: the constructor forwards
   * every argument, unchanged and in the same order, to the
   * {@link ReduceContext} constructor. It is an inner (non-static) class so
   * that it shares the enclosing reducer's type parameters.</p>
   */
  public class Context
      extends ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

    /**
     * Creates a context by delegating to the {@link ReduceContext}
     * constructor.
     *
     * @throws IOException if the superclass constructor fails with one
     * @throws InterruptedException if the superclass constructor is
     *         interrupted
     */
    public Context(Configuration conf, TaskAttemptID taskid,
                   RawKeyValueIterator input,
                   Counter inputCounter,
                   RecordWriter<KEYOUT, VALUEOUT> output,
                   OutputCommitter committer,
                   StatusReporter reporter,
                   RawComparator<KEYIN> comparator,
                   Class<KEYIN> keyClass,
                   Class<VALUEIN> valueClass
                   ) throws IOException, InterruptedException {
      super(conf, taskid, input, inputCounter, output, committer, reporter,
            comparator, keyClass, valueClass);
    }
  }

  /**
   * Called once at the start of the task, before any key is reduced.
   * The default implementation does nothing.
   *
   * @param context the context of the running reduce task
   */
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * This method is called once for each key. Most applications will define
   * their reduce class by overriding this method. The default implementation
   * is an identity function: every value is written back out under the key
   * it arrived with.
   *
   * @param key the key shared by all of {@code values}
   * @param values the values grouped under {@code key}
   * @param context the context used to emit output
   */
  @SuppressWarnings("unchecked")
  protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
                        ) throws IOException, InterruptedException {
    for (VALUEIN value : values) {
      // Unchecked casts: the identity behaviour is only type-correct when the
      // input key/value types are also valid output types. Subclasses with
      // different output types are expected to override this method.
      context.write((KEYOUT) key, (VALUEOUT) value);
    }
  }

  /**
   * Called once at the end of the task. The default implementation does
   * nothing.
   *
   * @param context the context of the running reduce task
   */
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
  }

  /**
   * Advanced application writers can use the
   * {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
   * control how the reduce task works.
   *
   * <p>The default implementation calls {@link #setup(Context)}, then
   * {@link #reduce(Object, Iterable, Context)} once for every key reported
   * by {@code context.nextKey()}, and finally {@link #cleanup(Context)}.
   * {@code cleanup} is invoked even when iterating or reducing throws, so
   * that whatever {@code setup} acquired can be released; it is not invoked
   * when {@code setup} itself fails. If both the reduce loop and
   * {@code cleanup} throw, the exception from {@code cleanup} is the one
   * that propagates.</p>
   *
   * @param context the context of the running reduce task
   */
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);
      }
    } finally {
      cleanup(context);
    }
  }
}

Shuffle

Sort

SecondarySort

Reduce