我使用的hadoop版本是hadoop1.1.2，而很多公司还在使用hadoop0.2x版本，因此市面上的hadoop资料版本不一。为了扩充自己的知识面，我对MapReduce的新旧api进行了比较研究。

　　hadoop版本1.x的包一般是mapreduce
　　hadoop版本0.x的包一般是mapred

我们还是以单词统计为例进行研究，使用旧api的完整代码如代码1.1所示：

package old;

import java.io.IOException;

import java.net.URI;

import java.util.Iterator;

import mapreduce.WordCountApp;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.FileInputFormat;

import org.apache.hadoop.mapred.FileOutputFormat;

import org.apache.hadoop.mapred.JobClient;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapred.MapReduceBase;

import org.apache.hadoop.mapred.Mapper;

import org.apache.hadoop.mapred.OutputCollector;

import org.apache.hadoop.mapred.Reducer;

import org.apache.hadoop.mapred.Reporter;

/**
 * Word-count example written against the old MapReduce API.
 *
 * <p>Hadoop 1.x examples generally use the {@code mapreduce} package;
 * Hadoop 0.x examples generally use the {@code mapred} package.
 */
public class OldAPP {

    static final String INPUT_PATH = "hdfs://hadoop:9000/hello";

    static final String OUT_PATH = "hdfs://hadoop:9000/out";

    /**
     * Configures the word-count job and submits it.
     *
     * <p>Differences from a new-API driver:
     * <ol>
     *   <li>{@code JobConf} is used instead of {@code Job}.</li>
     *   <li>Classes come from the {@code mapred} package instead of {@code mapreduce}.</li>
     *   <li>The job is submitted with {@code JobClient.runJob(job)} instead of
     *       {@code job.waitForCompletion(true)}.</li>
     * </ol>
     *
     * @param args command-line arguments (unused)
     * @throws Exception if file-system access or job submission fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Remove output left behind by a previous run before configuring the job.
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // Pass this driver class (not the unrelated new-API WordCountApp) so the job jar
        // is resolved from the class that actually contains MyMapper and MyReducer.
        final JobConf job = new JobConf(conf, OldAPP.class);

        // 1.1 Where the input files are located.
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // How each input line is parsed into a key/value pair (optional hint):
        //   job.setInputFormat(TextInputFormat.class);

        // 1.2 The custom map class.
        job.setMapperClass(MyMapper.class);
        // Map output <k2,v2> types; may be omitted when they match the reduce
        // output <k3,v3> types (optional hint):
        //   job.setMapOutputKeyClass(Text.class);
        //   job.setMapOutputValueClass(LongWritable.class);

        // 1.3 Partitioning (optional hint):
        //   job.setPartitionerClass(HashPartitioner.class);
        // Run a single reduce task (optional hint):
        //   job.setNumReduceTasks(1);

        // 1.4 TODO sorting and grouping
        // 1.5 TODO combiner

        // 2.2 The custom reduce class.
        job.setReducerClass(MyReducer.class);
        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.3 Where the output is written.
        FileOutputFormat.setOutputPath(job, outPath);
        // Output format class (optional hint):
        //   job.setOutputFormat(TextOutputFormat.class);

        // Submit the job for execution.
        JobClient.runJob(job);
    }

    /**
     * Emits {@code (token, 1)} for every tab-separated token of an input line.
     *
     * <p>New API: {@code extends Mapper}.
     * Old API: {@code extends MapReduceBase implements Mapper}.
     */
    static class MyMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, LongWritable> {

        // Reused across calls to avoid allocating two objects per token; this follows
        // the Writable-reuse pattern of the Hadoop WordCount tutorial.
        private final Text word = new Text();
        private final LongWritable one = new LongWritable(1);

        /**
         * @param k1 key of the input record (not used by this mapper)
         * @param v1 one line of input text
         * @param collector receives one {@code (token, 1)} pair per token
         * @param reporter not used by this mapper
         * @throws IOException if the collector fails to accept a pair
         */
        @Override
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            final String[] splited = v1.toString().split("\t");
            for (String token : splited) {
                word.set(token);
                collector.collect(word, one);
            }
        }
    }

    /**
     * Sums the counts collected for a word and emits {@code (word, total)}.
     */
    static class MyReducer extends MapReduceBase
            implements Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * @param k2 the word
         * @param v2s the counts emitted for {@code k2}; the old API passes an
         *            {@code Iterator}, so it is consumed with an explicit loop
         * @param collector receives the single {@code (word, total)} pair
         * @param reporter not used by this reducer
         * @throws IOException if the collector fails to accept the pair
         */
        @Override
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                times += v2s.next().get();
            }
            collector.collect(k2, new LongWritable(times));
        }
    }
}

代码 1.1

一、自定义Mapper类的不同

　　在新api中，是继承类org.apache.hadoop.mapreduce.Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>。在旧api中，是继承类org.apache.hadoop.mapred.MapReduceBase，然后实现接口org.apache.hadoop.mapred.Mapper<K1, V1, K2, V2>。在新api中，覆盖的map方法的第三个参数是Context类；在旧api中，覆盖的map方法的第三、四个形参分别是OutputCollector和Reporter类。在新api的Context中已经把这两个类的功能合并到一起了，用户操作更简单。使用旧api的自定义Mapper类，如代码1.2所示。输入文件的每一行会被解析成一个key、value对，每一个键值对调用一次map函数。

 /**
  * Old-API mapper that emits (token, 1) for each tab-separated token of a line.
  *
  * New API: extends Mapper
  * Old API: extends MapReduceBase implements Mapper
  */
 static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable>{

     /**
      * @param k1 key of the input record (not used here)
      * @param v1 one line of input text
      * @param collector receives one (token, 1) pair per token
      * @param reporter not used here
      */
     @Override
     public void map(LongWritable k1, Text v1,
             OutputCollector<Text, LongWritable> collector, Reporter reporter)
             throws IOException {
         final String line = v1.toString();
         final String[] tokens = line.split("\t");
         // Emit every token in order with a count of one.
         for (int i = 0; i < tokens.length; i++) {
             final Text outKey = new Text(tokens[i]);
             final LongWritable outValue = new LongWritable(1);
             collector.collect(outKey, outValue);
         }
     }
 }

代码 1.2

二、自定义Reducer类的不同

　　在新api中，是继承类org.apache.hadoop.mapreduce.Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>。在旧api中，是继承类org.apache.hadoop.mapred.MapReduceBase，然后实现接口org.apache.hadoop.mapred.Reducer<K2, V2, K3, V3>。在新api中，覆盖的reduce方法的第二个参数是java.lang.Iterable<VALUEIN>；在旧api中，覆盖的reduce方法的第二个参数是java.util.Iterator<V2>。前者可以使用增强for循环进行处理，后者需要通过hasNext()/next()显式迭代（例如使用while循环）。在新api中，覆盖的reduce方法的第三个参数是Context类；在旧api中，覆盖的reduce方法的第三、四个形参分别是OutputCollector和Reporter类。在新api的Context中已经把这两个类的功能合并到一起了，用户操作更简单。使用旧api的自定义Reducer类，如代码2.1所示。

 /**
  * Old-API reducer that adds up the counts of one word and emits (word, total).
  */
 static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable>{

     /**
      * @param k2 the word
      * @param v2s the counts emitted for k2, supplied as an Iterator in the old API
      * @param collector receives the single (word, total) pair
      * @param reporter not used here
      */
     @Override
     public void reduce(Text k2, Iterator<LongWritable> v2s,
             OutputCollector<Text, LongWritable> collector, Reporter reporter)
             throws IOException {
         long total = 0L;
         // Drain the iterator, accumulating each count.
         for (; v2s.hasNext(); ) {
             final LongWritable count = v2s.next();
             total = total + count.get();
         }
         final LongWritable sum = new LongWritable(total);
         collector.collect(k2, sum);
     }
 }

代码 2.1

三、驱动代码main方法的不同

　　在新api中，驱动代码主要是通过org.apache.hadoop.mapreduce.Job类实现的，通过该类管理各种配置，然后调用waitForCompletion(boolean)方法把作业提交给JobTracker执行。在旧api中，驱动代码主要是通过org.apache.hadoop.mapred.JobConf类（使用构造方法JobConf(Configuration, Class)创建）实现的，通过该类管理各种配置。对于job的提交，是通过org.apache.hadoop.mapred.JobClient类的runJob(JobConf)方法实现的。可见，新api中把JobConf和JobClient的功能进行了合并，用户调用更方便。

　　其中，JobConf类与Job类的方法名称几乎一致，只是传递的形参类型大不相同了。在新api中的Job类，要求setXXX(…)的形参必须是org.apache.hadoop.mapreduce及其子包下面的类；而旧api中的JobConf类，要求setXXX(…)的形参必须是org.apache.hadoop.mapred及其子包下面的类。使用旧api的驱动代码main方法，如代码3.1所示。

 package old;

 import java.io.IOException;

 import java.net.URI;

 import java.util.Iterator;

 import mapreduce.WordCountApp;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.fs.FileSystem;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapred.FileInputFormat;

 import org.apache.hadoop.mapred.FileOutputFormat;

 import org.apache.hadoop.mapred.JobClient;

 import org.apache.hadoop.mapred.JobConf;

 import org.apache.hadoop.mapred.MapReduceBase;

 import org.apache.hadoop.mapred.Mapper;

 import org.apache.hadoop.mapred.OutputCollector;

 import org.apache.hadoop.mapred.Reducer;

 import org.apache.hadoop.mapred.Reporter;

 import org.apache.hadoop.mapred.TextInputFormat;

 import org.apache.hadoop.mapred.TextOutputFormat;

 import org.apache.hadoop.mapred.lib.HashPartitioner;

 /**
  * Word-count example written against the old MapReduce API.
  *
  * <p>Hadoop 1.x examples generally use the {@code mapreduce} package;
  * Hadoop 0.x examples generally use the {@code mapred} package.
  */
 public class OldAPP {

     static final String INPUT_PATH = "hdfs://hadoop:9000/hello";

     static final String OUT_PATH = "hdfs://hadoop:9000/out";

     /**
      * Configures the word-count job and submits it.
      *
      * <p>Differences from a new-API driver:
      * <ol>
      *   <li>{@code JobConf} is used instead of {@code Job}.</li>
      *   <li>Classes come from the {@code mapred} package instead of {@code mapreduce}.</li>
      *   <li>The job is submitted with {@code JobClient.runJob(job)} instead of
      *       {@code job.waitForCompletion(true)}.</li>
      * </ol>
      *
      * @param args command-line arguments (unused)
      * @throws Exception if file-system access or job submission fails
      */
     public static void main(String[] args) throws Exception {
         Configuration conf = new Configuration();

         // Remove output left behind by a previous run before configuring the job.
         final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
         final Path outPath = new Path(OUT_PATH);
         if (fileSystem.exists(outPath)) {
             fileSystem.delete(outPath, true);
         }

         // Pass this driver class (not the unrelated new-API WordCountApp) so the job jar
         // is resolved from the class that actually contains MyMapper and MyReducer.
         final JobConf job = new JobConf(conf, OldAPP.class);

         // 1.1 Where the input files are located.
         FileInputFormat.setInputPaths(job, INPUT_PATH);

         // 1.2 The custom map class.
         job.setMapperClass(MyMapper.class);

         // Map output <k2,v2> types; may be omitted when they match the reduce
         // output <k3,v3> types.
         job.setMapOutputKeyClass(Text.class);
         job.setMapOutputValueClass(LongWritable.class);

         // 1.3 Partitioning.
         job.setPartitionerClass(HashPartitioner.class);

         // Run a single reduce task.
         job.setNumReduceTasks(1);

         // 2.2 The custom reduce class.
         job.setReducerClass(MyReducer.class);

         // Reduce output types.
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(LongWritable.class);

         // 2.3 Where the output is written.
         FileOutputFormat.setOutputPath(job, outPath);

         // Submit the job for execution.
         JobClient.runJob(job);
     }

     /**
      * Emits {@code (token, 1)} for every tab-separated token of an input line.
      *
      * <p>New API: {@code extends Mapper}.
      * Old API: {@code extends MapReduceBase implements Mapper}.
      */
     static class MyMapper extends MapReduceBase
             implements Mapper<LongWritable, Text, Text, LongWritable> {

         // Reused across calls to avoid allocating two objects per token; this follows
         // the Writable-reuse pattern of the Hadoop WordCount tutorial.
         private final Text word = new Text();
         private final LongWritable one = new LongWritable(1);

         /**
          * @param k1 key of the input record (not used by this mapper)
          * @param v1 one line of input text
          * @param collector receives one {@code (token, 1)} pair per token
          * @param reporter not used by this mapper
          * @throws IOException if the collector fails to accept a pair
          */
         @Override
         public void map(LongWritable k1, Text v1,
                 OutputCollector<Text, LongWritable> collector, Reporter reporter)
                 throws IOException {
             final String[] splited = v1.toString().split("\t");
             for (String token : splited) {
                 word.set(token);
                 collector.collect(word, one);
             }
         }
     }

     /**
      * Sums the counts collected for a word and emits {@code (word, total)}.
      */
     static class MyReducer extends MapReduceBase
             implements Reducer<Text, LongWritable, Text, LongWritable> {

         /**
          * @param k2 the word
          * @param v2s the counts emitted for {@code k2}; the old API passes an
          *            {@code Iterator}, so it is consumed with an explicit loop
          * @param collector receives the single {@code (word, total)} pair
          * @param reporter not used by this reducer
          * @throws IOException if the collector fails to accept the pair
          */
         @Override
         public void reduce(Text k2, Iterator<LongWritable> v2s,
                 OutputCollector<Text, LongWritable> collector, Reporter reporter)
                 throws IOException {
             long times = 0L;
             while (v2s.hasNext()) {
                 times += v2s.next().get();
             }
             collector.collect(k2, new LongWritable(times));
         }
     }
 }

代码 3.1

秒客网

Hadoop日记Day15---MapReduce新旧api的比较

一、自定义Mapper类的不同

二、自定义Reducer类的不同

三、驱动代码main方法的不同

Hadoop日记Day15---MapReduce新旧api的比较

一、自定义Mapper类的不同

二、自定义Reducer类的不同

三、 驱动代码main方法的不同

三、驱动代码main方法的不同