Original post: https://mp.weixin.qq.com/s/3qQqN6qzQ3a8_Au2qfZnVg
Copyright belongs to the original author; if anything here infringes, please contact me and I will take it down.
The original article used Excel to compute its statistics; here I redo the same statistics with the tool I have just been learning, as practice.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * https://mp.weixin.qq.com/s/3qQqN6qzQ3a8_Au2qfZnVg
 * Computes a series of statistics over the worldwide data behind
 * [Emerging ecosystems: Python or R, which is better suited to Big Data (Spark/Hadoop) and Deep Learning?]
 */
public class wechat extends Configured implements Tool {

/**
 * Map phase: parse each survey record, update counters, and emit (tool, 1) pairs.
 */
private static class ModuleMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
private static final IntWritable mapOutputValue = new IntWritable(1) ;
private Text mapOutputKey = new Text() ;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String input = value.toString();
// Split the CSV record once; a full record has at least 17 fields
// (index 16 holds the semicolon-separated tool list used below).
String[] arrStr = input.split(",");
if (arrStr.length < 17) {
return;
}
// Counter: Python + Big Data
if("1".equals(arrStr[2])&&"1".equals(arrStr[14])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "Python_BigData").increment(1L);
}
// Counter: Python + Deep Learning
if("1".equals(arrStr[2])&&"1".equals(arrStr[13])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "Python_Deep-Learning").increment(1L);
}
// Counter: R + Big Data
if("1".equals(arrStr[3])&&"1".equals(arrStr[14])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "R_BigData").increment(1L);
}
// Counter: R + Deep Learning
if("1".equals(arrStr[3])&&"1".equals(arrStr[13])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "R_Deep-Learning").increment(1L);
}
// Field 16 lists the tools the respondent uses, separated by ';'.
String[] tools = arrStr[16].split(";");
// Emit one (tool, 1) pair per listed tool.
for (String tool : tools) {
mapOutputKey.set(tool);
context.write(mapOutputKey, mapOutputValue);
}
}
}

/**
 * Reduce phase: aggregate the per-tool counts.
 */
private static class ModuleReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
private IntWritable outputValue = new IntWritable() ;
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// Accumulate the counts for this tool.
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
// Only keep tools mentioned at least 500 times.
if (sum < 500) {
return;
}
outputValue.set(sum);
// Emit the aggregated (tool, count) pair.
context.write(key, outputValue);
}
}

/**
 * Driver: create the Job, submit it, and return the exit status.
 */
public int run(String[] args) throws Exception {
// Create a Job.
Job job = Job.getInstance(
this.getConf() , wechat.class.getSimpleName()
) ;
// Set the class whose jar should be shipped with the job.
job.setJarByClass(wechat.class);
// Configure the job:
// 1. Input: where the data is read from.
Path inputPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inputPath);
// 2. Mapper class.
job.setMapperClass(ModuleMapper.class);
// Map output key and value types.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 3. Reducer class.
job.setReducerClass(ModuleReducer.class);
// Reducer output key and value types.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Number of reduce tasks (optional).
// job.setNumReduceTasks(2);
// 4. Output: where the results are saved.
Path outputPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outputPath);
// Submit the job and wait for completion.
boolean isSuccess = job.waitForCompletion(true);
// Return the exit status.
return isSuccess ? 0 : 1;
}

/**
 * Program entry point.
 * @param args input path and output path
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
if(2 > args.length){
System.out.println("Usage: " + wechat.class.getSimpleName() +" <in> <out>");
return ;
}
// Load the Hadoop configuration files: core-*.xml, hdfs-*.xml, yarn-*.xml, mapred-*.xml.
Configuration conf = new Configuration();
// Run the job through ToolRunner.
int status = ToolRunner.run(conf, new wechat(), args);
// Exit with the job's status code.
System.exit(status);
}
}
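A quick way to sanity-check the mapper's parsing logic outside the cluster is the minimal local sketch below (it is not part of the original post). It assumes the column layout implied by the mapper above: index 2 is the Python flag, index 3 the R flag, index 13 the Deep Learning flag, index 14 the Big Data flag, and index 16 the semicolon-separated tool list; the class name WechatLineParseDemo and the sample record are invented purely for illustration.

import java.util.Arrays;

public class WechatLineParseDemo {
    public static void main(String[] args) {
        // Build one hypothetical 17-column survey record (all flags default to "0").
        String[] fields = new String[17];
        Arrays.fill(fields, "0");
        fields[2] = "1";                        // respondent uses Python
        fields[13] = "1";                       // works on Deep Learning
        fields[14] = "1";                       // works on Big Data
        fields[16] = "Spark;TensorFlow;Hadoop"; // tools, separated by ';'
        String line = String.join(",", fields);
        // Re-split exactly as the mapper does and evaluate the same counter conditions.
        String[] cols = line.split(",");
        if (cols.length < 17) {
            System.out.println("record skipped: fewer than 17 columns");
            return;
        }
        System.out.println("Python_BigData       -> " + ("1".equals(cols[2]) && "1".equals(cols[14])));
        System.out.println("Python_Deep-Learning -> " + ("1".equals(cols[2]) && "1".equals(cols[13])));
        System.out.println("R_BigData            -> " + ("1".equals(cols[3]) && "1".equals(cols[14])));
        System.out.println("R_Deep-Learning      -> " + ("1".equals(cols[3]) && "1".equals(cols[13])));
        // Field 16 yields one (tool, 1) map output pair per listed tool.
        for (String tool : cols[16].split(";")) {
            System.out.println("emit: (" + tool + ", 1)");
        }
    }
}

The MapReduce job itself is run in the usual way: package it into a jar and submit it with the input and output paths as the two arguments expected by main.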