Reducejoin sample

示例文件同sample join analysis
之前的示例使用的是map端的join，这次使用reduce端的join。
根据数据源的类别编写不同的mapper来处理不同的文件，输出的key都是studentno，value是其他信息再加上类别标记。
然后使用MultipleInputs为不同的路径注册不同的mapper。
reduce端相同的studentno的学生信息和考试成绩分配给同一个reduce,而且value中包含了这些信息，
把这些信息抽取出来，再做笛卡尔积即可。
下面的示例代码中，我没有使用MultipleInputs，而是自己修改了TextInputFormat，让它返回文件名（作为key）和当前行的内容（作为value）。
在mapper中根据文件名区分两个不同文件的记录，加上不同的类别标记后输出。
下面的代码中还有很多可以优化的地方，以后再更新。
package myexamples;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.compress.CompressionCodec;

import org.apache.hadoop.io.compress.CompressionCodecFactory;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.JobContext;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.util.LineReader;

public class reducejoin {

    /**
     * File input format whose records are read by {@link MyLineRecordReader}.
     */
    public static class MyTextInputFormat extends FileInputFormat<Text, Text> {

        /**
         * Hands back a brand-new reader; neither argument is inspected here.
         *
         * @param split   the split the reader will later be initialized with
         * @param context the task attempt context
         * @return a new, uninitialized {@link MyLineRecordReader}
         */
        @Override
        public MyLineRecordReader createRecordReader(InputSplit split,
                TaskAttemptContext context) {
            MyLineRecordReader reader = new MyLineRecordReader();
            return reader;
        }

        /**
         * Reports whether {@code file} may be divided into several splits.
         *
         * @param context job context supplying the configuration
         * @param file    path of the input file
         * @return {@code true} only when no compression codec is resolved for
         *         the path
         */
        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            Configuration conf = context.getConfiguration();
            CompressionCodecFactory codecs = new CompressionCodecFactory(conf);
            if (codecs.getCodec(file) != null) {
                return false;
            }
            return true;
        }

    }

    /**
     * Line-oriented record reader for a file split. Every record's key is the
     * name of the file the split belongs to and its value is one line of text.
     */
    public static class MyLineRecordReader extends RecordReader<Text, Text> {

        // Log under this class's own name so messages are attributed correctly.
        private static final Log LOG = LogFactory
                .getLog(MyLineRecordReader.class);

        private CompressionCodecFactory compressionCodecs = null;

        /** Offset of the first byte this reader is responsible for. */
        private long start;

        /** Offset of the next byte to be read. */
        private long pos;

        /** Offset at which reading stops (exclusive). */
        private long end;

        private LineReader in;

        /** Upper bound on the number of bytes accepted for a single line. */
        private int maxLineLength;

        /** File name of the split; set once in {@link #initialize}. */
        private Text key = null;

        /** Buffer that receives the current line. */
        private Text value = null;

        // Never assigned or read inside this class; retained because it is
        // package-visible and may be referenced from elsewhere.
        Text filename = null;

        /**
         * Opens the split's file and positions the reader on the first line
         * that belongs to this split.
         *
         * @param genericSplit the split to read; cast to {@link FileSplit}
         * @param context      task context supplying the configuration
         * @throws IOException if the file cannot be opened or read
         */
        @Override
        public void initialize(InputSplit genericSplit,
                TaskAttemptContext context) throws IOException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt(
                    "mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();
            // The key is the bare file name and stays the same for every record.
            key = new Text(file.getName());
            compressionCodecs = new CompressionCodecFactory(job);
            final CompressionCodec codec = compressionCodecs.getCodec(file);

            // open the file and seek to the start of the split
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(split.getPath());
            boolean skipFirstLine = false;
            if (codec != null) {
                // Compressed input: read the decoded stream until it is
                // exhausted rather than up to a byte offset.
                in = new LineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } else {
                if (start != 0) {
                    // Not the first split: step back one byte and discard the
                    // first line read from there (presumably it is emitted by
                    // the reader of the preceding split).
                    skipFirstLine = true;
                    --start;
                    fileIn.seek(start);
                }
                in = new LineReader(fileIn, job);
            }
            if (skipFirstLine) { // skip first line and re-establish "start".
                start += in.readLine(new Text(), 0,
                        (int) Math.min((long) Integer.MAX_VALUE, end - start));
            }
            this.pos = start;
        }

        /**
         * Reads the next line into the value buffer.
         *
         * @return {@code true} if a line was read; {@code false} once nothing
         *         more can be read, in which case key and value are set to
         *         {@code null}
         * @throws IOException if reading from the underlying stream fails
         */
        @Override
        public boolean nextKeyValue() throws IOException {
            if (value == null) {
                value = new Text();
            }
            int newSize = 0;
            while (pos < end) {
                newSize = in.readLine(value, maxLineLength, Math.max(
                        (int) Math.min(Integer.MAX_VALUE, end - pos),
                        maxLineLength));
                if (newSize == 0) {
                    break;
                }
                pos += newSize;
                if (newSize < maxLineLength) {
                    break;
                }

                // line too long. try again
                LOG.info("Skipped line of size " + newSize + " at pos "
                        + (pos - newSize));
            }
            if (newSize == 0) {
                key = null;
                value = null;
                return false;
            } else {
                return true;
            }
        }

        /** @return the file name of the split, or {@code null} after the last record */
        @Override
        public Text getCurrentKey() {
            return key;
        }

        /** @return the most recently read line, or {@code null} after the last record */
        @Override
        public Text getCurrentValue() {
            return value;
        }

        /**
         * Get the progress within the split
         *
         * @return fraction of the split consumed, capped at 1.0; 0.0 when the
         *         split is empty
         */
        @Override
        public float getProgress() {
            if (start == end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        /**
         * Closes the underlying line reader if one was opened.
         *
         * @throws IOException if closing the reader fails
         */
        @Override
        public synchronized void close() throws IOException {
            if (in != null) {
                in.close();
            }
        }

    }

    public static class studentMapper extends Mapper<Text, Text, Text, Text> {

        public void map(Text key, Text value, Context context)

                throws IOException, InterruptedException {

            Text newvalue = null;

            String strv = value.toString().substring(

                    value.toString().indexOf(","));

            if (key.toString().contains("student")) // student file

                newvalue = new Text("student" + strv);

            else

                newvalue = new Text("score" + strv);

            Text newkey = new Text(value.toString().substring(0,

                    value.toString().indexOf(",")));

            context.write(newkey, newvalue);

        }

    }

    /**
     * Joins the tagged values that share a key: every value tagged as a
     * student record is paired with every other value.
     */
    public static class studentReducer extends Reducer<Text, Text, Text, Text> {

        /**
         * Separates the values into those starting with "student" and the
         * rest, strips the tag plus the one character after it, and writes the
         * cross product of the two groups as "student,score" strings.
         *
         * @param key     the join key
         * @param values  tagged values for that key
         * @param context sink for the joined records
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the write is interrupted
         */
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            final String studentTag = "student";
            // Each prefix is the tag followed by one separator character.
            final int studentPrefix = studentTag.length() + 1;
            final int scorePrefix = "score".length() + 1;

            List<String> studentRows = new ArrayList<String>();
            List<String> scoreRows = new ArrayList<String>();

            for (Text tagged : values) {
                String text = tagged.toString();
                if (!text.startsWith(studentTag)) {
                    scoreRows.add(text.substring(scorePrefix));
                } else {
                    studentRows.add(text.substring(studentPrefix));
                }
            }

            // Cartesian product of the two groups.
            for (String studentRow : studentRows) {
                for (String scoreRow : scoreRows) {
                    Text joined = new Text(studentRow + "," + scoreRow);
                    context.write(key, joined);
                }
            }
        }

    }

    /**
     * Configures and runs the reduce-side join job.
     *
     * Expects two path arguments, {@code <in> <out>}, after any generic Hadoop
     * options. When no path arguments are given, the sample HDFS locations
     * below are used, so command-line paths are honored instead of being
     * overwritten. The output folder is deleted before the job is submitted.
     *
     * @param args generic Hadoop options followed by optional {@code <in> <out>}
     * @throws Exception if option parsing, folder deletion or the job fails
     */
    public static void main(String[] args) throws Exception {
        final String defaultInput = "hdfs://namenode:9000/user/hadoop/student/";
        final String defaultOutput = "hdfs://namenode:9000/user/hadoop/reducejoinout";

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length == 0) {
            // No paths supplied: fall back to the sample locations.
            otherArgs = new String[] { defaultInput, defaultOutput };
        }
        if (otherArgs.length != 2) {
            System.err.println("Usage: reducejoin <in> <out>");
            System.exit(2);
        }

        // Remove the previous output folder before the job is submitted.
        myUtils.myUtils.DeleteFolder(conf, otherArgs[1]);
        conf.set("io.sort.mb", "10");

        Job job = new Job(conf, "reduce join");
        job.setInputFormatClass(MyTextInputFormat.class);
        job.setJarByClass(reducejoin.class);
        job.setMapperClass(studentMapper.class);
        job.setReducerClass(studentReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
相关文章