
I am trying to sort the word counts in order of value by chaining two mappers and reducers, but when the second job starts it fails and says:

14/12/21 18:43:35 ERROR security.UserGroupInformation: PriviledgedActionException as:cloudera (auth:SIMPLE) cause:org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://localhost.localdomain:8020/user/cloudera/wordcount/output already exists

Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://localhost.localdomain:8020/user/cloudera/wordcount/output already exists

Here is my code:

package org.myorg;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapreduce.Job;

public class WordCount {

public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, one);
        }
    }
}

public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

class Map1 extends MapReduceBase implements Mapper<Object, Text, IntWritable, Text> {

    public void map(Object key, Text value, OutputCollector<IntWritable, Text> collector, Reporter arg3) throws IOException {
        String line = value.toString();
        StringTokenizer stringTokenizer = new StringTokenizer(line);
        {
            int number = 999;
            String word = "empty";

            if (stringTokenizer.hasMoreTokens()) {
                String str0 = stringTokenizer.nextToken();
                word = str0.trim();
            }

            if (stringTokenizer.hasMoreElements()) {
                String str1 = stringTokenizer.nextToken();
                number = Integer.parseInt(str1.trim());
            }
            collector.collect(new IntWritable(number), new Text(word));
        }

    }

}

class Reduce1 extends MapReduceBase implements Reducer<IntWritable, Text, IntWritable, Text> {

    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<IntWritable, Text> arg2, Reporter arg3) throws IOException {
        while ((values.hasNext())) {
            arg2.collect(key, values.next());
        }
    }

}

public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordCount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path("wordcount/output"));

//JobClient.runJob(conf);
    //------------------------------------------------------------------
    JobConf conf2 = new JobConf(WordCount.class);
    conf2.setJobName("WordCount1");

    conf2.setOutputKeyClass(Text.class);
    conf2.setOutputValueClass(IntWritable.class);

    conf2.setMapperClass(Map1.class);
    conf2.setCombinerClass(Reduce1.class);
    conf2.setReducerClass(Reduce1.class);

    conf2.setInputFormat(TextInputFormat.class);
    conf2.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf2, new Path("wordcount/output/part-00000"));
    FileOutputFormat.setOutputPath(conf2, new Path(args[1]));

    Job job1 = new Job(conf);
    Job job2 = new Job(conf2);

    job1.submit();
    if (job1.waitForCompletion(true)) {
        job2.submit();
        job2.waitForCompletion(true);
    }
}
}

I have tried changing the path a few times, even creating a new directory called tmp, but no luck.

Current error message:

    14/12/21 19:58:12 INFO mapred.JobClient: Running job: job_201412211623_0042
    14/12/21 19:58:13 INFO mapred.JobClient:  map 0% reduce 0%
    14/12/21 19:58:35 INFO mapred.JobClient: Task Id :      attempt_201412211623_0042_m_000001_0, Status : FAILED
    java.lang.RuntimeException: Error in configuring object
    at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:109)
    at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:75)
    at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:413)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:332)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:396)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1438)
    at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
14/12/21 19:58:35 INFO mapred.JobClient: Task Id : attempt_201412211623_0042_m_000000_0, Status : FAILED
java.lang.RuntimeException: Error in configuring object
    at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:109)
    at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:75)
    at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:413)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:332)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:268)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:396)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1438)
    at org.apache.hadoop.mapred.Child.main(Child.java:262)
Caused by: java.lang.reflect.InvocationTargetException
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.ja
14/12/21 19:58:54 INFO mapred.JobClient: Task Id : attempt_201412211623_0042_m_000001_1, Status : FAILED
java.lang.RuntimeException: Error in configuring object
– Roland
3 Answers


All the error says is that the `wordcount/output` directory already exists. I see that you hard-coded the output directory for the first MR job (`FileOutputFormat.setOutputPath(conf, new Path("wordcount/output"));`).

If this directory already exists, the job will fail, because Hadoop refuses to overwrite existing output. Try removing that directory, or run the job with a new output directory.
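
If you would rather handle this in the driver than delete the directory by hand, here is a minimal sketch (my addition, not from the original code) using the standard `FileSystem` API; it reuses the `conf` and the hard-coded path from the question:

    // Delete the stale output directory before submitting, so the job
    // does not fail with FileAlreadyExistsException.
    // Assumes: import org.apache.hadoop.fs.FileSystem;
    FileSystem fs = FileSystem.get(conf);
    Path out = new Path("wordcount/output");
    if (fs.exists(out)) {
        fs.delete(out, true); // true = delete recursively
    }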

– nitishagar
  • When I change the directories like this: Job1: `FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path("tmp"));` Job2: `FileInputFormat.setInputPaths(conf2, new Path("tmp/part-00000")); FileOutputFormat.setOutputPath(conf2, new Path(args[1]));` I get an error that says: java.lang.RuntimeException: Error in configuring object – Roland Dec 22 '14 at 03:29
  • Is the `FileOutputFormat.setOutputPath(conf2, new Path(args[1]));` args[1] also the same as `tmp` in the above case? – nitishagar Dec 22 '14 at 03:30
  • So should I make a completely new directory like temp/tempfiles and store the first output there? – Roland Dec 22 '14 at 03:34
  • The system creates that for you once you run the job; you just need to specify the directory. Say you specify `outputTemp`: don't create it, it will be created once the output needs to be populated. – nitishagar Dec 22 '14 at 03:37
  • Still getting that same error; I really have no clue anymore. java.lang.RuntimeException: Error in configuring object. Thank you for the help, though. – Roland Dec 22 '14 at 03:54
  • Do you still get the `FileAlreadyExistsException`, or is it a new exception? – nitishagar Dec 22 '14 at 03:56
  • No, I'm getting a new exception now saying "java.lang.RuntimeException: Error in configuring object". – Roland Dec 22 '14 at 04:02
  • Can you paste the full error message in the question? – nitishagar Dec 22 '14 at 04:03

Your two jobs may be trying to write to the same location. HDFS does not allow updating or overwriting files, so you need to delete the existing file or directory first if you want to write to the same location again.

Here are some useful references:

chaining-multiple-mapreduce-jobs-in-hadoop

job chaining
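
For the old `mapred` API used in the question, the references boil down to running the jobs back-to-back with the blocking `JobClient.runJob` call. A minimal sketch (my addition, reusing the `conf` and `conf2` from the question and assuming each job gets its own, not-yet-existing output directory):

    // Job 2 only starts after Job 1 has completed successfully,
    // because runJob blocks until completion (and throws on failure).
    JobClient.runJob(conf);   // first job: writes the word counts
    JobClient.runJob(conf2);  // second job: reads job 1's output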

– Sandy

I suggest you use the new API.

This example is based on the new API:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ChainJobs extends Configured implements Tool {

 private static final String OUTPUT_PATH = "intermediate_output";

 @Override
 public int run(String[] args) throws Exception {
  /*
   * Job 1
   */
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Job job = new Job(conf, "Job1");
  job.setJarByClass(ChainJobs.class);

  job.setMapperClass(MyMapper1.class);
  job.setReducerClass(MyReducer1.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  TextInputFormat.addInputPath(job, new Path(args[0]));
  TextOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

  // Bail out if Job 1 failed; Job 2 depends on its output.
  if (!job.waitForCompletion(true)) {
    return 1;
  }

  /*
   * Job 2
   */
  Configuration conf2 = getConf();
  Job job2 = new Job(conf2, "Job 2");
  job2.setJarByClass(ChainJobs.class);

  job2.setMapperClass(MyMapper2.class);
  job2.setReducerClass(MyReducer2.class);

  job2.setOutputKeyClass(Text.class);
  job2.setOutputValueClass(Text.class);

  job2.setInputFormatClass(TextInputFormat.class);
  job2.setOutputFormatClass(TextOutputFormat.class);

  TextInputFormat.addInputPath(job2, new Path(OUTPUT_PATH));
  TextOutputFormat.setOutputPath(job2, new Path(args[1]));

  return job2.waitForCompletion(true) ? 0 : 1;
 }

 public static void main(String[] args) throws Exception {
  // Driver: ToolRunner parses generic Hadoop options, then calls run().
  System.exit(ToolRunner.run(new ChainJobs(), args));
 }
}

`private static final String OUTPUT_PATH = "intermediate_output";` defines the output path for the first job, which then serves as the input to the second job.
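
As a follow-up (my addition, not part of the original answer), you could delete the intermediate directory once the second job has finished, reusing the `fs` handle obtained in `run`. A sketch of the last lines of `run`:

    // Clean up the intermediate output after Job 2 completes.
    boolean ok = job2.waitForCompletion(true);
    fs.delete(new Path(OUTPUT_PATH), true); // true = delete recursively
    return ok ? 0 : 1;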

Refer to this.

Hope this helps.

– USB