Original post: https://mp.weixin.qq.com/s/3qQqN6qzQ3a8_Au2qfZnVg
Copyright belongs to the original author; if anything here infringes, please contact me and I will take it down.
The original article used Excel to compute its statistics; here I redo the same statistics with the tool I have just been learning, as practice.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * https://mp.weixin.qq.com/s/3qQqN6qzQ3a8_Au2qfZnVg
 * Computes a series of statistics over the worldwide data behind
 * [Emerging ecosystems: Python or R, which is better suited to Big Data (Spark/Hadoop) and Deep Learning?]
 */
public class wechat extends Configured implements Tool {

/**
 * Map phase: parse each survey record, update counters, and emit (tool, 1) pairs.
 */
private static class ModuleMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
private static final IntWritable mapOutputValue = new IntWritable(1) ;
private Text mapOutputKey = new Text() ;
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String input = value.toString();
// Split the CSV record once; a full record has at least 17 fields
// (index 16 holds the semicolon-separated tool list used below).
String[] arrStr = input.split(",");
if (arrStr.length < 17) {
return;
}
// Counter: Python + Big Data
if("1".equals(arrStr[2])&&"1".equals(arrStr[14])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "Python_BigData").increment(1L);
}
// Counter: Python + Deep Learning
if("1".equals(arrStr[2])&&"1".equals(arrStr[13])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "Python_Deep-Learning").increment(1L);
}
// Counter: R + Big Data
if("1".equals(arrStr[3])&&"1".equals(arrStr[14])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "R_BigData").increment(1L);
}
// Counter: R + Deep Learning
if("1".equals(arrStr[3])&&"1".equals(arrStr[13])) {
context.getCounter("WECHAT_MAPPER_COUNTERS", "R_Deep-Learning").increment(1L);
}
// Field 16 lists the tools the respondent uses, separated by ';'.
String[] tools = arrStr[16].split(";");
// Emit one (tool, 1) pair per listed tool.
for (String tool : tools) {
mapOutputKey.set(tool);
context.write(mapOutputKey, mapOutputValue);
}
}
}

/**
 * Reduce phase: aggregate the per-tool counts.
 */
private static class ModuleReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
private IntWritable outputValue = new IntWritable() ;
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
// Accumulate the counts for this tool.
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
// Only keep tools mentioned at least 500 times.
if (sum < 500) {
return;
}
outputValue.set(sum);
// Emit the aggregated (tool, count) pair.
context.write(key, outputValue);
}
}

/**
 * Driver: create the Job, submit it, and return the exit status.
 */
public int run(String[] args) throws Exception {
// Create a Job.
Job job = Job.getInstance(
this.getConf() , wechat.class.getSimpleName()
) ;
// Set the class whose jar should be shipped with the job.
job.setJarByClass(wechat.class);
// Configure the job:
// 1. Input: where the data is read from.
Path inputPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inputPath);
// 2. Mapper class.
job.setMapperClass(ModuleMapper.class);
// Map output key and value types.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 3. Reducer class.
job.setReducerClass(ModuleReducer.class);
// Reducer output key and value types.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Number of reduce tasks (optional).
// job.setNumReduceTasks(2);
// 4. Output: where the results are saved.
Path outputPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outputPath);
// Submit the job and wait for completion.
boolean isSuccess = job.waitForCompletion(true);
// Return the exit status.
return isSuccess ? 0 : 1;
}

/**
 * Program entry point.
 * @param args input path and output path
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
if(2 > args.length){
System.out.println("Usage: " + wechat.class.getSimpleName() +" <in> <out>");
return ;
}
// Load the Hadoop configuration files: core-*.xml, hdfs-*.xml, yarn-*.xml, mapred-*.xml.
Configuration conf = new Configuration();
// Run the job through ToolRunner.
int status = ToolRunner.run(conf, new wechat(), args);
// Exit with the job's status code.
System.exit(status);
}
}
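A quick way to sanity-check the mapper's parsing logic outside the cluster is the minimal local sketch below (it is not part of the original post). It assumes the column layout implied by the mapper above: index 2 is the Python flag, index 3 the R flag, index 13 the Deep Learning flag, index 14 the Big Data flag, and index 16 the semicolon-separated tool list; the class name WechatLineParseDemo and the sample record are invented purely for illustration.

import java.util.Arrays;

public class WechatLineParseDemo {
    public static void main(String[] args) {
        // Build one hypothetical 17-column survey record (all flags default to "0").
        String[] fields = new String[17];
        Arrays.fill(fields, "0");
        fields[2] = "1";                        // respondent uses Python
        fields[13] = "1";                       // works on Deep Learning
        fields[14] = "1";                       // works on Big Data
        fields[16] = "Spark;TensorFlow;Hadoop"; // tools, separated by ';'
        String line = String.join(",", fields);
        // Re-split exactly as the mapper does and evaluate the same counter conditions.
        String[] cols = line.split(",");
        if (cols.length < 17) {
            System.out.println("record skipped: fewer than 17 columns");
            return;
        }
        System.out.println("Python_BigData       -> " + ("1".equals(cols[2]) && "1".equals(cols[14])));
        System.out.println("Python_Deep-Learning -> " + ("1".equals(cols[2]) && "1".equals(cols[13])));
        System.out.println("R_BigData            -> " + ("1".equals(cols[3]) && "1".equals(cols[14])));
        System.out.println("R_Deep-Learning      -> " + ("1".equals(cols[3]) && "1".equals(cols[13])));
        // Field 16 yields one (tool, 1) map output pair per listed tool.
        for (String tool : cols[16].split(";")) {
            System.out.println("emit: (" + tool + ", 1)");
        }
    }
}

The MapReduce job itself is run in the usual way: package it into a jar and submit it with the input and output paths as the two arguments expected by main.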