MR案例：内连接代码实现

本文使用MapReduce的Java API实现Hive中的【内连接】，对应的HQL语句详见《Hive查询Join》。
package join.map;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.VLongWritable;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side inner join of two tab-separated text inputs on a numeric id.
 *
 * <p>Input files are told apart by file name: a file whose name ends with
 * {@code sales} carries the join id in its second column, a file whose name
 * ends with {@code things} carries it in its first column. The mapper keys
 * every record by that id and prefixes the record with its file name; the
 * reducer then pairs every sales record with every things record that share
 * the same id.
 *
 * <p>Usage: {@code JoinOn <input path> <output path>}
 */
public class JoinOn {

    /**
     * Separator between the file-name tag and the original record in the map
     * output value. It must differ from the field separator used inside the
     * records (a tab), otherwise the reducer could not tell tag and record apart.
     */
    private static final String TAG_SEPARATOR = ":";

    /** File-name suffix identifying the sales input (join id in column 1). */
    private static final String SALES_SUFFIX = "sales";

    /** File-name suffix identifying the things input (join id in column 0). */
    private static final String THINGS_SUFFIX = "things";

    /**
     * Configures and runs the join job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be set up or fails while running
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: JoinOn <input path> <output path>");
            System.exit(2);
        }

        // Temporary workaround for running on Windows: point Hadoop at a local
        // installation, but do not clobber a value supplied via -Dhadoop.home.dir.
        if (System.getProperty("hadoop.home.dir") == null) {
            System.setProperty("hadoop.home.dir", "D:\\workspace\\hadoop-2.2.0");
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinOn.class);

        job.setMapperClass(JOMapper.class);
        job.setReducerClass(JOReducer.class);

        job.setMapOutputKeyClass(VLongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits {@code <id, fileName + ':' + record>} for every non-blank input line
     * of a sales or things file. Lines from any other file are ignored.
     */
    public static class JOMapper extends Mapper<LongWritable, Text, VLongWritable, Text> {

        /** Name of the file backing the current input split; resolved once in {@link #setup}. */
        private String name;

        /**
         * Resolves the file name of the current split once, instead of on every
         * {@code map()} call.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            name = ((FileSplit) context.getInputSplit()).getPath().getName();
        }

        /**
         * Tags one input record with its source file name and keys it by the join id.
         *
         * @param key     position of the line in the input (unused)
         * @param value   one tab-separated record
         * @param context sink for the tagged record
         * @throws NumberFormatException if the id column is not a valid long
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Blank lines (e.g. a trailing newline at the end of a file) carry no
            // record; skip them rather than failing on the missing id column.
            if (line.trim().isEmpty()) {
                return;
            }

            int idColumn;
            if (name.endsWith(SALES_SUFFIX)) {
                // sales record: name '\t' id
                idColumn = 1;
            } else if (name.endsWith(THINGS_SUFFIX)) {
                // things record: id '\t' name
                idColumn = 0;
            } else {
                return;
            }

            String[] fields = line.split("\t");
            long id = Long.parseLong(fields[idColumn]);
            // <key, value> --> <id, fileName + ':' + original record>
            context.write(new VLongWritable(id), new Text(name + TAG_SEPARATOR + line));
        }
    }

    /**
     * For each id, writes the cross product of the sales records and the things
     * records carrying that id: output key is the sales record, output value is
     * the things record. Ids present on only one side produce no output.
     */
    public static class JOReducer extends Reducer<VLongWritable, Text, Text, Text> {

        /**
         * Separates the tagged values of one id by source and emits every
         * sales/things pairing.
         *
         * @param key     the join id
         * @param v2s     tagged records ({@code fileName + ':' + record}) sharing this id
         * @param context sink for the joined pairs
         */
        @Override
        protected void reduce(VLongWritable key, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {
            // Original records of each side for this id.
            List<String> sales = new ArrayList<String>();
            List<String> things = new ArrayList<String>();

            for (Text text : v2s) {
                // Split on the FIRST separator only, so a ':' inside the record
                // itself is kept instead of truncating the record.
                String[] parts = text.toString().split(TAG_SEPARATOR, 2);
                if (parts.length < 2) {
                    continue;
                }
                String tag = parts[0];
                String record = parts[1];
                if (tag.endsWith(SALES_SUFFIX)) {
                    sales.add(record);
                } else if (tag.endsWith(THINGS_SUFFIX)) {
                    things.add(record);
                }
            }

            // Inner join: nothing to emit unless both sides have this id.
            if (sales.isEmpty() || things.isEmpty()) {
                return;
            }

            // Cartesian product of the two sides.
            for (String sale : sales) {
                Text saleText = new Text(sale);
                for (String thing : things) {
                    context.write(saleText, new Text(thing));
                }
            }
        }
    }
}
总结：
　1).程序中获取FileName的代码应放在setup()方法中：每个Map任务（对应一个输入分片）只需获取一次文件名，而map()方法会对每条记录各执行一次
　2).Map输出的value中，文件名(name)与原始记录之间的分隔符不能与记录内各字段之间的分隔符相同，否则Reduce端无法正确拆分出文件名和记录
秒客网

MR案例：内连接代码实现

相关文章