一个典型的MapReduce实例------webcount(网站统计访客信息)

时间:2023-03-09 12:49:40
一个典型的MapReduce实例------webcount(网站统计访客信息)

统计某一特定网站在每个小时内的访问次数(按访问日志的行数计,每行日志记为一次访问)

所用版本:hadoop2.6.5

数据样式如下:

111.111.111.111 - - [16/Dec/2012:05:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:33:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:34:45 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:34:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:09:34:55 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:10:23:30 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:10:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"

辅助类

 package com.trendwise.software;

 import java.text.SimpleDateFormat;
import java.util.Date;
import java.io.DataInput; import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/**
 * Hadoop key type that wraps a {@link Date}.
 *
 * <p>The value is serialized as the date's epoch-millisecond {@code long} and
 * ordered by the natural ordering of {@link Date}. {@link #hashCode()} and
 * {@link #equals(Object)} are value-based so that equal dates are always
 * routed to the same partition by hash-based partitioners.
 */
public class DateWritable implements WritableComparable<DateWritable> {

    // SimpleDateFormat is not thread-safe, so every use of this shared
    // instance is guarded by synchronizing on it (see toString()).
    private final static SimpleDateFormat formatter = new SimpleDateFormat( "yyyy-MM-dd' T 'HH:mm:ss.SSS" );

    private Date date;

    /**
     * @return the wrapped date, or {@code null} if none has been set or read yet
     */
    public Date getDate() {
        return date;
    }

    /**
     * @param date the date this key should represent
     */
    public void setDate( Date date ) {
        this.date = date;
    }

    /**
     * Restores the date from the epoch-millisecond value written by
     * {@link #write(DataOutput)}.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        date = new Date( in.readLong() );
    }

    /**
     * Writes the date as a single epoch-millisecond {@code long}.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong( date.getTime() );
    }

    /**
     * Orders keys chronologically by their wrapped dates.
     */
    @Override
    public int compareTo(DateWritable o) {
        return date.compareTo( o.getDate() );
    }

    /**
     * Value-based hash. Without this override the identity hash of
     * {@link Object} would be used, which differs between instances that
     * hold the same date.
     */
    @Override
    public int hashCode() {
        return date == null ? 0 : date.hashCode();
    }

    /**
     * Two keys are equal when they wrap equal dates (or both wrap none),
     * which keeps {@code equals} consistent with {@link #compareTo}.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof DateWritable)) {
            return false;
        }
        Date otherDate = ((DateWritable) other).date;
        return date == null ? otherDate == null : date.equals(otherDate);
    }

    /**
     * Formats the date with the pattern {@code yyyy-MM-dd' T 'HH:mm:ss.SSS}.
     */
    @Override
    public String toString() {
        synchronized (formatter) {
            return formatter.format( date);
        }
    }
}

mapper:解析每行日志中的时间戳,以“年-月-日-小时”(分、秒清零)作为键,输出计数 1

 package com.trendwise.software;

 import java.io.IOException;
import java.util.Calendar;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper; public class LogMapper extends Mapper<LongWritable, Text, DateWritable, IntWritable> {
public static DateWritable dates = new DateWritable();
public final static IntWritable two = new IntWritable(1);
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String text = value.toString();
// Get the date and time
int openBracket = text.indexOf( '[' );
int closeBracket = text.indexOf( ']' );
if( openBracket != -1 && closeBracket != -1 ) {
// Read the date
String dateString = text.substring( text.indexOf( '[' ) + 1, text. indexOf( ']' ) );
// Build a date object from a string of the form: 16/Dec/2012:05:32:50 -0500
int index = 0;
int nextIndex = dateString.indexOf( '/' );
int day = Integer.parseInt( dateString.substring(index, nextIndex) ); index = nextIndex; nextIndex = dateString.indexOf( '/', index+1 );
String month = dateString.substring( index+1, nextIndex );
index = nextIndex;
nextIndex = dateString.indexOf( ':', index );
int year = Integer.parseInt(dateString.substring(index + 1, nextIndex));
index = nextIndex; nextIndex = dateString.indexOf( ':', index+1 );
int hour = Integer.parseInt(dateString.substring(index + 1, nextIndex));
// Build a calendar object for this date
Calendar calendar = Calendar.getInstance();
calendar.set( Calendar.DATE, day );
calendar.set( Calendar.YEAR, year );
calendar.set( Calendar.HOUR, hour );
calendar.set( Calendar.MINUTE, 0 );
calendar.set( Calendar.SECOND, 0 );
calendar.set( Calendar.MILLISECOND, 0 );
if( month.equalsIgnoreCase( "dec" ) ) {
calendar.set( Calendar.MONTH, Calendar.DECEMBER );
}
else if( month.equalsIgnoreCase( "nov" ) ) {
calendar.set( Calendar.MONTH, Calendar.NOVEMBER );
}
else if( month.equalsIgnoreCase( "oct" ) ) {
calendar.set( Calendar.MONTH, Calendar.OCTOBER );
}
else if( month.equalsIgnoreCase( "sep" ) ) {
calendar.set( Calendar.MONTH, Calendar.SEPTEMBER );
}
else if( month.equalsIgnoreCase( "aug" ) ) {
calendar.set( Calendar.MONTH, Calendar.AUGUST );
}
else if( month.equalsIgnoreCase( "jul" ) ) {
calendar.set( Calendar.MONTH, Calendar.JULY );
}
else if( month.equalsIgnoreCase( "jun" ) ) {
calendar.set( Calendar.MONTH, Calendar.JUNE );
}
else if( month.equalsIgnoreCase( "may" ) ) {
calendar.set( Calendar.MONTH, Calendar.MAY );
}
else if( month.equalsIgnoreCase( "apr" ) ) {
calendar.set( Calendar.MONTH, Calendar.APRIL );
}
else if( month.equalsIgnoreCase( "mar" ) ) {
calendar.set( Calendar.MONTH, Calendar.MARCH );
}
else if( month.equalsIgnoreCase( "feb" ) ) {
calendar.set( Calendar.MONTH, Calendar.FEBRUARY );
}
else if( month.equalsIgnoreCase( "jan" ) ) {
calendar.set( Calendar.MONTH, Calendar.JANUARY );
} dates.setDate( calendar.getTime() );
context.write(dates, two); }
}
}

reducer:汇总每个小时内的访问次数

 package com.trendwise.software;

 import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Adds up all counts received for one {@link DateWritable} key and emits a
 * single (key, total) pair.
 */
public class LogReducer extends Reducer<DateWritable, IntWritable, DateWritable, IntWritable> {

    /**
     * Sums the values grouped under {@code key}.
     *
     * @param key     the key whose counts are being combined
     * @param values  the counts received for that key
     * @param context used to emit the summed count
     */
    @Override
    public void reduce( DateWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for( final IntWritable partial : values ) {
            total = total + partial.get();
        }

        final IntWritable sum = new IntWritable( total );
        context.write( key, sum );
    }
}

driver 配置信息,程序入口

 package com.trendwise.software;

 import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Entry point that configures and submits the hourly access-count job.
 *
 * <p>Arguments, in order:
 * <ol>
 *   <li>input path</li>
 *   <li>output path</li>
 *   <li>split size in megabytes (applied to all split-size settings below)</li>
 *   <li>number of reduce tasks</li>
 * </ol>
 */
public class Driver {

    /**
     * Parses the command line, runs the job and exits with status 0 on
     * success, 1 if the job fails, or 2 if too few arguments are given.
     *
     * @param args see the class description
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 4) {
            System.err.println("Usage: Driver <input path> <output path> <split size MB> <number of reducers>");
            System.exit(2);
        }

        String in = args[0];
        String out = args[1];
        int unitmb = Integer.parseInt(args[2]);
        int nreducer = Integer.parseInt(args[3]);

        // Computed as long: unitmb * 1024 * 1024 overflows int from 2048 MB upward.
        long splitBytes = unitmb * 1024L * 1024L;

        Configuration conf = new Configuration();
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", splitBytes);
        conf.setLong("mapred.min.split.size", splitBytes);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", splitBytes);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", splitBytes);

        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(Driver.class);

        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));

        job.setMapperClass(LogMapper.class);
        // The reducer only sums counts, so it is also used as the combiner.
        job.setCombinerClass(LogReducer.class);
        job.setReducerClass(LogReducer.class);
        job.setNumReduceTasks(nreducer);

        job.setMapOutputKeyClass(DateWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(DateWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Propagate job failure to the caller through the process exit status.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

command

一个典型的MapReduce实例------webcount(网站统计访客信息)

result

一个典型的MapReduce实例------webcount(网站统计访客信息)

一个典型的MapReduce实例------webcount(网站统计访客信息)