mapReducer 去重副的单词

时间:2023-03-10 08:23:44
mapReducer 去重副的单词

需求是: 统计输出某目录文件的所有单词,去除重复的单词。

mapper阶段正常做map工作,映射。 切割单词。 <key,value> -->  <word,nullWritable>

reducer阶段,对于同一个key 的一组信息,是只输出第一个。

mapper 和wordcount 的单词数是一样的。

package com.mapreduce.mapper;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper; public class DistinctMapper extends Mapper<LongWritable, Text, Text, NullWritable>{ Text text = new Text();
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
String worlds[] = line.split(" ");
for( String word:worlds ){
text.set(word);
context.write(text, NullWritable.get());
}
} }

reducer 对于同一个key 的一组, 只输出一个就ok 了。(  ... ... )

package com.mapreduce.mapper;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer; public class DistincReducer extends Reducer<Text, NullWritable, Text, NullWritable>{ @Override
protected void reduce(Text key, Iterable<NullWritable> value, Context context)
throws IOException, InterruptedException { context.write(key, NullWritable.get());
} }

job 提交

package com.mapreduce.mapper;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; public class DriverDemo { public static void main(String[] args) throws Exception, IOException { Configuration configuration = new Configuration(); // 2 job Job job = Job.getInstance(configuration); // 3 作业jar包 job.setJarByClass(DriverDemo.class); // 4 map, reduce jar 包
job.setMapperClass(DistinctMapper.class);
job.setReducerClass(DistincReducer.class);
// 5 map 输出类型 job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class); // 6 最终 输出类型 (reducer) job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class); // 7 inputformatclass , outputformatclass 输入输出入文件类型 可能决定分片信息 job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class); // 8 输入输出文件路径 FileInputFormat.setInputPaths(job, new Path("d:/input"));
FileOutputFormat.setOutputPath(job, new Path("d:/output5")); // 9 job提交 job.waitForCompletion(true);
} }