
I use the code below to get output in the form (Key, Value):

Apple 12
Bee 345 
Cat 123

What I want is the output sorted by value in descending order, with the value placed before the key, i.e. (Value, Key):

345 Bee
123 Cat
12 Apple

I found there is something called "secondary sort"; not going to lie, I'm so lost. I tried to change context.write(key, result); but failed miserably. I'm new to Hadoop and not sure how to start tackling this problem. Any recommendation would be appreciated. Which function do I need to change, or which class do I need to modify?

Here are my classes:

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper 
       extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer 
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, 
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.err.println("Usage: wordcount <in> [<in>...] <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
      FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job,
      new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
  • See these links: http://stackoverflow.com/questions/18403857/how-to-sort-word-count-by-value-in-hadoop and http://stackoverflow.com/questions/2550784/sorted-word-count-using-hadoop-mapreduce. Hope it helps. – Manjunath H Feb 28 '15 at 19:03

1 Answer


You have the word count working correctly.

You will need a second job (with a trivial map and an identity reduce) to handle your other requirement, the descending sort and the key/value swap (see the sketch after this list):

  1. Use a DecreasingComparator as the sort comparator
  2. Use InverseMapper to swap keys and values
  3. Use the identity reducer, i.e. Reducer.class - with the identity reducer no aggregation happens, as each value is written out individually for its key
  4. Set the number of reduce tasks to 1, or use TotalOrderPartitioner
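
Here is a minimal sketch putting those four steps together as a two-job driver. The class name SortByCount, the intermediate path "wordcount-temp", and the simplified argument handling are my own placeholders, and it assumes the WordCount class from your question lives in the same package. One caveat: IntWritable does not ship a DecreasingComparator (LongWritable does), so the sketch supplies a tiny custom descending comparator instead. The first job is changed to write a SequenceFile so the (Text, IntWritable) types survive into the second job, where InverseMapper flips each pair to (IntWritable, Text).

package org.apache.hadoop.examples;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SortByCount {

  // IntWritable has no built-in decreasing comparator, so this reverses
  // its natural ascending order by swapping the operands.
  public static class DescendingIntComparator extends WritableComparator {
    protected DescendingIntComparator() {
      super(IntWritable.class, true);
    }
    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
      return super.compare(b, a);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path in = new Path(args[0]);
    Path temp = new Path("wordcount-temp");  // placeholder intermediate dir
    Path out = new Path(args[1]);

    // Job 1: your word count, but writing a SequenceFile so the
    // (Text, IntWritable) types are preserved for job 2.
    Job countJob = new Job(conf, "word count");
    countJob.setJarByClass(SortByCount.class);
    countJob.setMapperClass(WordCount.TokenizerMapper.class);
    countJob.setCombinerClass(WordCount.IntSumReducer.class);
    countJob.setReducerClass(WordCount.IntSumReducer.class);
    countJob.setOutputKeyClass(Text.class);
    countJob.setOutputValueClass(IntWritable.class);
    countJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(countJob, in);
    FileOutputFormat.setOutputPath(countJob, temp);
    if (!countJob.waitForCompletion(true)) {
      System.exit(1);
    }

    // Job 2: InverseMapper swaps (word, count) to (count, word); the stock
    // Reducer.class is the identity reducer and passes each pair through.
    Job sortJob = new Job(conf, "sort by count");
    sortJob.setJarByClass(SortByCount.class);
    sortJob.setInputFormatClass(SequenceFileInputFormat.class);
    sortJob.setMapperClass(InverseMapper.class);
    sortJob.setReducerClass(Reducer.class);  // identity reducer, no aggregation
    sortJob.setNumReduceTasks(1);            // one reducer => one globally sorted file
    sortJob.setOutputKeyClass(IntWritable.class);
    sortJob.setOutputValueClass(Text.class);
    sortJob.setSortComparatorClass(DescendingIntComparator.class);
    FileInputFormat.addInputPath(sortJob, temp);
    FileOutputFormat.setOutputPath(sortJob, out);
    System.exit(sortJob.waitForCompletion(true) ? 0 : 1);
  }
}

With a single reduce task the one output file is globally sorted, so for your sample data it would read 345 Bee, then 123 Cat, then 12 Apple. If you need more than one reducer, that is where TotalOrderPartitioner comes in. Cleanup of the intermediate directory is left out for brevity.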