参考代码
TVPlayCount.java
package com.dajiangtai.hadoop.tvplay; import java.io.IOException; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; import com.sun.org.apache.bcel.internal.generic.NEW; public class TVPlayCount extends Configured implements Tool{ public static class TVPlayMapper extends Mapper<Text, TVPlayData, Text, TVPlayData>{
@Override
protected void map(Text key, TVPlayData value,Context context)
throws IOException, InterruptedException
{
context.write(key, value);
}
} public static class TVPlayReducer extends Reducer<Text, TVPlayData, Text, Text>
{
private Text m_key=new Text();
private Text m_value = new Text();
private MultipleOutputs<Text, Text> mos; //将多路输出打开
protected void setup(Context context) throws IOException,InterruptedException
{
mos = new MultipleOutputs<Text, Text>(context);
} protected void reduce (Text Key,Iterable<TVPlayData> Values, Context context)
throws IOException, InterruptedException{
int daynumber = ;
int collectnumber = ;
int commentnumber = ;
int againstnumber = ;
int supportnumber = ; for (TVPlayData tv : Values){
daynumber+=tv.getDaynumber();
collectnumber+=tv.getCollectnumber();
commentnumber += tv.getCommentnumber();
againstnumber += tv.getAgainstnumber();
supportnumber += tv.getSupportnumber();
} String[] records=Key.toString().split("\t"); // 1优酷 2搜狐 3 土豆 4爱奇艺 5迅雷看看
String source =records[]; // 媒体类别
m_key.set(records[]);
m_value.set(daynumber+"\t"+collectnumber+"\t" +commentnumber+"\t"+againstnumber+"\t"+supportnumber);
if(source.equals("")){
mos.write("youku", m_key, m_value);
}else if (source.equals("")) {
mos.write("souhu", m_key, m_value);
} else if (source.equals("")) {
mos.write("tudou", m_key, m_value);
} else if (source.equals("")) {
mos.write("aiqiyi", m_key, m_value);
} else if (source.equals("")) {
mos.write("xunlei", m_key, m_value);
}
} //关闭 MultipleOutputs,也就是关闭 RecordWriter,并且是一堆 RecordWriter,因为这里会有很多 reduce 被调用。
protected void cleanup( Context context) throws IOException,InterruptedException {
mos.close();
}
} @Override
public int run(String[] args) throws Exception {
Configuration conf = new Configuration(); // 配置文件对象
Path mypath = new Path(args[]);
FileSystem hdfs = mypath.getFileSystem(conf);// 创建输出路径
if (hdfs.isDirectory(mypath)) {
hdfs.delete(mypath, true);
} Job job = new Job(conf, "tvplay");// 构造任务
job.setJarByClass(TVPlayCount.class);// 设置主类 job.setMapperClass(TVPlayMapper.class);// 设置Mapper
job.setMapOutputKeyClass(Text.class);// key输出类型
job.setMapOutputValueClass(TVPlayData.class);// value输出类型
job.setInputFormatClass(TVPlayInputFormat.class);//自定义输入格式 job.setReducerClass(TVPlayReducer.class);// 设置Reducer
job.setOutputKeyClass(Text.class);// reduce key类型
job.setOutputValueClass(Text.class);// reduce value类型
// 自定义文件输出格式,通过路径名(pathname)来指定输出路径
MultipleOutputs.addNamedOutput(job, "youku", TextOutputFormat.class,
Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "souhu", TextOutputFormat.class,
Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "tudou", TextOutputFormat.class,
Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "aiqiyi", TextOutputFormat.class,
Text.class, Text.class);
MultipleOutputs.addNamedOutput(job, "xunlei", TextOutputFormat.class,
Text.class, Text.class); FileInputFormat.addInputPath(job, new Path(args[]));// 输入路径
FileOutputFormat.setOutputPath(job, new Path(args[]));// 输出路径
job.waitForCompletion(true);
return ;
} public static void main(String[] args) throws Exception{
String[] args0={"hdfs://master:9000/tvplay/",
"hdfs://master:9000/tvplay/out"};
int ec = ToolRunner.run(new Configuration(), new TVPlayCount(), args0);
System.exit(ec);
}
}
TVPlayData.java
package com.dajiangtai.hadoop.tvplay; import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException; import org.apache.hadoop.io.WritableComparable;
/**
*
* @author yangjun
* @function 自定义对象
*/
public class TVPlayData implements WritableComparable<Object>{
private int daynumber;
private int collectnumber;
private int commentnumber;
private int againstnumber;
private int supportnumber;
public TVPlayData(){}
public void set(int daynumber,int collectnumber,int commentnumber,int againstnumber,int supportnumber){
this.daynumber = daynumber;
this.collectnumber = collectnumber;
this.commentnumber = commentnumber;
this.againstnumber = againstnumber;
this.supportnumber = supportnumber;
}
public int getDaynumber() {
return daynumber;
}
public void setDaynumber(int daynumber) {
this.daynumber = daynumber;
}
public int getCollectnumber() {
return collectnumber;
}
public void setCollectnumber(int collectnumber) {
this.collectnumber = collectnumber;
}
public int getCommentnumber() {
return commentnumber;
}
public void setCommentnumber(int commentnumber) {
this.commentnumber = commentnumber;
}
public int getAgainstnumber() {
return againstnumber;
}
public void setAgainstnumber(int againstnumber) {
this.againstnumber = againstnumber;
}
public int getSupportnumber() {
return supportnumber;
}
public void setSupportnumber(int supportnumber) {
this.supportnumber = supportnumber;
}
@Override
public void readFields(DataInput in) throws IOException {
daynumber = in.readInt();
collectnumber = in.readInt();
commentnumber = in.readInt();
againstnumber = in.readInt();
supportnumber = in.readInt();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(daynumber);
out.writeInt(collectnumber);
out.writeInt(commentnumber);
out.writeInt(againstnumber);
out.writeInt(supportnumber);
}
@Override
public int compareTo(Object o) {
return ;
};
}
TVPlayInputFormat.java
package com.dajiangtai.hadoop.tvplay; import java.io.IOException; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
/**
*
* @author yangjun
* @function key vlaue 输入格式
*/
public class TVPlayInputFormat extends FileInputFormat<Text,TVPlayData>{ @Override
public RecordReader<Text, TVPlayData> createRecordReader(InputSplit input,
TaskAttemptContext context) throws IOException, InterruptedException {
return new TVPlayRecordReader();
} public class TVPlayRecordReader extends RecordReader<Text, TVPlayData>{
public LineReader in;
public Text lineKey;
public TVPlayData lineValue;
public Text line;
@Override
public void close() throws IOException {
if(in !=null){
in.close();
}
} @Override
public Text getCurrentKey() throws IOException, InterruptedException {
return lineKey;
} @Override
public TVPlayData getCurrentValue() throws IOException, InterruptedException {
return lineValue;
} @Override
public float getProgress() throws IOException, InterruptedException {
return ;
} @Override
public void initialize(InputSplit input, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split=(FileSplit)input;
Configuration job=context.getConfiguration();
Path file=split.getPath();
FileSystem fs=file.getFileSystem(job); FSDataInputStream filein=fs.open(file);
in=new LineReader(filein,job);
line=new Text();
lineKey=new Text();
lineValue = new TVPlayData();
} @Override
public boolean nextKeyValue() throws IOException, InterruptedException {
int linesize=in.readLine(line);
if(linesize==) return false;
String[] pieces = line.toString().split("\t");
if(pieces.length != ){
throw new IOException("Invalid record received");
}
lineKey.set(pieces[]+"\t"+pieces[]);
lineValue.set(Integer.parseInt(pieces[]),Integer.parseInt(pieces[]),Integer.parseInt(pieces[])
,Integer.parseInt(pieces[]),Integer.parseInt(pieces[]));
return true;
}
}
}
先启动3节点集群
与自己在本地搭建的3节点集群的hdfs连接上
在终端显示的运行结果,程序没有错误
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - session.id is deprecated. Instead, use dfs.metrics.session-id
-- ::, INFO [org.apache.hadoop.metrics.jvm.JvmMetrics] - Initializing JVM Metrics with processName=JobTracker, sessionId=
-- ::, WARN [org.apache.hadoop.mapreduce.JobSubmitter] - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
-- ::, WARN [org.apache.hadoop.mapreduce.JobSubmitter] - No job jar file set. User classes may not be found. See Job or Job#setJar(String).
-- ::, INFO [org.apache.hadoop.mapreduce.lib.input.FileInputFormat] - Total input paths to process :
-- ::, INFO [org.apache.hadoop.mapreduce.JobSubmitter] - number of splits:
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - user.name is deprecated. Instead, use mapreduce.job.user.name
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.output.value.class is deprecated. Instead, use mapreduce.job.output.value.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.mapoutput.value.class is deprecated. Instead, use mapreduce.map.output.value.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapreduce.map.class is deprecated. Instead, use mapreduce.job.map.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.job.name is deprecated. Instead, use mapreduce.job.name
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapreduce.reduce.class is deprecated. Instead, use mapreduce.job.reduce.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapreduce.inputformat.class is deprecated. Instead, use mapreduce.job.inputformat.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.input.dir is deprecated. Instead, use mapreduce.input.fileinputformat.inputdir
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.output.dir is deprecated. Instead, use mapreduce.output.fileoutputformat.outputdir
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.output.key.class is deprecated. Instead, use mapreduce.job.output.key.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.mapoutput.key.class is deprecated. Instead, use mapreduce.map.output.key.class
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.working.dir is deprecated. Instead, use mapreduce.job.working.dir
-- ::, INFO [org.apache.hadoop.mapreduce.JobSubmitter] - Submitting tokens for job: job_local300699497_0001
-- ::, WARN [org.apache.hadoop.conf.Configuration] - file:/tmp/hadoop-Administrator/mapred/staging/Administrator300699497/.staging/job_local300699497_0001/job.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.retry.interval; Ignoring.
-- ::, WARN [org.apache.hadoop.conf.Configuration] - file:/tmp/hadoop-Administrator/mapred/staging/Administrator300699497/.staging/job_local300699497_0001/job.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.attempts; Ignoring.
-- ::, WARN [org.apache.hadoop.conf.Configuration] - file:/tmp/hadoop-Administrator/mapred/local/localRunner/Administrator/job_local300699497_0001/job_local300699497_0001.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.retry.interval; Ignoring.
-- ::, WARN [org.apache.hadoop.conf.Configuration] - file:/tmp/hadoop-Administrator/mapred/local/localRunner/Administrator/job_local300699497_0001/job_local300699497_0001.xml:an attempt to override final parameter: mapreduce.job.end-notification.max.attempts; Ignoring.
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - The url to track the job: http://localhost:8080/
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - Running job: job_local300699497_0001
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - OutputCommitter set in config null
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - Waiting for map tasks
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local300699497_0001_m_000000_0
-- ::, INFO [org.apache.hadoop.yarn.util.ProcfsBasedProcessTree] - ProcfsBasedProcessTree currently is supported only on Linux.
-- ::, INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@1b9156ad
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://master:9000/tvplay/tvplay.txt:0+10833923
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - Job job_local300699497_0001 running in uber mode : false
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - map % reduce %
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) kvi ()
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb:
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - soft limit at
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - bufstart = ; bufvoid =
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - kvstart = ; length =
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] -
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - bufstart = ; bufend = ; bufvoid =
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - kvstart = (); kvend = (); length = /
-- ::, INFO [org.apache.hadoop.mapred.MapTask] - Finished spill
-- ::, INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local300699497_0001_m_000000_0 is done. And is in the process of committing
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - map
-- ::, INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local300699497_0001_m_000000_0' done.
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local300699497_0001_m_000000_0
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - Map task executor complete.
-- ::, INFO [org.apache.hadoop.yarn.util.ProcfsBasedProcessTree] - ProcfsBasedProcessTree currently is supported only on Linux.
-- ::, INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@fba110e
-- ::, INFO [org.apache.hadoop.mapred.Merger] - Merging sorted segments
-- ::, INFO [org.apache.hadoop.mapred.Merger] - Down to the last merge-pass, with segments left of total size: bytes
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] -
-- ::, INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - map % reduce %
-- ::, INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local300699497_0001_r_000000_0 is done. And is in the process of committing
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] -
-- ::, INFO [org.apache.hadoop.mapred.Task] - Task attempt_local300699497_0001_r_000000_0 is allowed to commit now
-- ::, INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - Saved output of task 'attempt_local300699497_0001_r_000000_0' to hdfs://master:9000/tvplay/out/_temporary/0/task_local300699497_0001_r_000000
-- ::, INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce
-- ::, INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local300699497_0001_r_000000_0' done.
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - map % reduce %
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - Job job_local300699497_0001 completed successfully
-- ::, INFO [org.apache.hadoop.mapreduce.Job] - Counters:
File System Counters
FILE: Number of bytes read=
FILE: Number of bytes written=
FILE: Number of read operations=
FILE: Number of large read operations=
FILE: Number of write operations=
HDFS: Number of bytes read=
HDFS: Number of bytes written=
HDFS: Number of read operations=
HDFS: Number of large read operations=
HDFS: Number of write operations=
Map-Reduce Framework
Map input records=
Map output records=
Map output bytes=
Map output materialized bytes=
Input split bytes=
Combine input records=
Combine output records=
Reduce input groups=
Reduce shuffle bytes=
Reduce input records=
Reduce output records=
Spilled Records=
Shuffled Maps =
Failed Shuffles=
Merged Map outputs=
GC time elapsed (ms)=
CPU time spent (ms)=
Physical memory (bytes) snapshot=
Virtual memory (bytes) snapshot=
Total committed heap usage (bytes)=
File Input Format Counters
Bytes Read=
File Output Format Counters
Bytes Written=
查看hdfs上的输出结果