Expectations
Use this MapReduce program to understand how a MapReduce job executes, focusing on extracting useful information from the messages the program prints while it runs.
Before Execution
Program Code
The code is lifted almost verbatim from "Hadoop: The Definitive Guide". Its purpose is to find the highest temperature recorded in a given year. The relevant code is as follows:
import org.apache.commons.lang.StringUtils; // or org.apache.commons.lang3.StringUtils, depending on the dependency in use

/**
 * Simple bean that parses one line of raw NCDC weather data by fixed column offsets.
 */
public class NcdcWeather {
    private String USAF_station_id;
    private String WBAN_station_id;
    private String date;
    private String time;
    private String latitude;
    private String longitude;
    /** Elevation */
    private String elevation;
    /** Wind direction */
    private String wind_direction;
    private String wind_direction_quality_code;
    private String sky_ceiling_height;
    private String sky_ceiling_height_quality_code;
    private String visibility_distance;
    private String visibility_distance_quality_code;
    private String air_temperature;
    private String air_temperature_quality_code;
    private String dew_point_temperature;
    private String dew_point_temperature_quality_code;
    private String atmospheric_pressure;
    private String atmospheric_pressure_quality_code;

    public NcdcWeather(String rowData) {
        if (StringUtils.isEmpty(rowData) || rowData.length() < 105) {
            return;
        }
        USAF_station_id = rowData.substring(4, 10);
        WBAN_station_id = rowData.substring(10, 15);
        date = rowData.substring(15, 23);
        time = rowData.substring(23, 27);
        latitude = rowData.substring(28, 34);
        longitude = rowData.substring(34, 41);
        elevation = rowData.substring(46, 51);
        wind_direction = rowData.substring(60, 63);
        wind_direction_quality_code = rowData.substring(63, 64);
        sky_ceiling_height = rowData.substring(70, 75);
        sky_ceiling_height_quality_code = rowData.substring(75, 76);
        visibility_distance = rowData.substring(78, 84);
        visibility_distance_quality_code = rowData.substring(84, 85);
        air_temperature = rowData.substring(87, 92);
        air_temperature_quality_code = rowData.substring(92, 93);
        dew_point_temperature = rowData.substring(93, 98);
        dew_point_temperature_quality_code = rowData.substring(98, 99);
        atmospheric_pressure = rowData.substring(99, 104);
        atmospheric_pressure_quality_code = rowData.substring(104, 105);
    }

    public String getUSAF_station_id() { return USAF_station_id; }
    public void setUSAF_station_id(String USAF_station_id) { this.USAF_station_id = USAF_station_id; }
    public String getWBAN_station_id() { return WBAN_station_id; }
    public void setWBAN_station_id(String WBAN_station_id) { this.WBAN_station_id = WBAN_station_id; }
    public String getDate() { return date; }
    public void setDate(String date) { this.date = date; }
    public String getTime() { return time; }
    public void setTime(String time) { this.time = time; }
    public String getLatitude() { return latitude; }
    public void setLatitude(String latitude) { this.latitude = latitude; }
    public String getLongitude() { return longitude; }
    public void setLongitude(String longitude) { this.longitude = longitude; }
    public String getElevation() { return elevation; }
    public void setElevation(String elevation) { this.elevation = elevation; }
    public String getWind_direction() { return wind_direction; }
    public void setWind_direction(String wind_direction) { this.wind_direction = wind_direction; }
    public String getWind_direction_quality_code() { return wind_direction_quality_code; }
    public void setWind_direction_quality_code(String wind_direction_quality_code) { this.wind_direction_quality_code = wind_direction_quality_code; }
    public String getSky_ceiling_height() { return sky_ceiling_height; }
    public void setSky_ceiling_height(String sky_ceiling_height) { this.sky_ceiling_height = sky_ceiling_height; }
    public String getSky_ceiling_height_quality_code() { return sky_ceiling_height_quality_code; }
    public void setSky_ceiling_height_quality_code(String sky_ceiling_height_quality_code) { this.sky_ceiling_height_quality_code = sky_ceiling_height_quality_code; }
    public String getVisibility_distance() { return visibility_distance; }
    public void setVisibility_distance(String visibility_distance) { this.visibility_distance = visibility_distance; }
    public String getVisibility_distance_quality_code() { return visibility_distance_quality_code; }
    public void setVisibility_distance_quality_code(String visibility_distance_quality_code) { this.visibility_distance_quality_code = visibility_distance_quality_code; }
    public String getAir_temperature() { return air_temperature; }
    public void setAir_temperature(String air_temperature) { this.air_temperature = air_temperature; }
    public String getAir_temperature_quality_code() { return air_temperature_quality_code; }
    public void setAir_temperature_quality_code(String air_temperature_quality_code) { this.air_temperature_quality_code = air_temperature_quality_code; }
    public String getDew_point_temperature() { return dew_point_temperature; }
    public void setDew_point_temperature(String dew_point_temperature) { this.dew_point_temperature = dew_point_temperature; }
    public String getDew_point_temperature_quality_code() { return dew_point_temperature_quality_code; }
    public void setDew_point_temperature_quality_code(String dew_point_temperature_quality_code) { this.dew_point_temperature_quality_code = dew_point_temperature_quality_code; }
    public String getAtmospheric_pressure() { return atmospheric_pressure; }
    public void setAtmospheric_pressure(String atmospheric_pressure) { this.atmospheric_pressure = atmospheric_pressure; }
    public String getAtmospheric_pressure_quality_code() { return atmospheric_pressure_quality_code; }
    public void setAtmospheric_pressure_quality_code(String atmospheric_pressure_quality_code) { this.atmospheric_pressure_quality_code = atmospheric_pressure_quality_code; }

    @Override
    public String toString() {
        return "NcdcWeather{" +
                "USAF_station_id='" + USAF_station_id + '\'' +
                ", WBAN_station_id='" + WBAN_station_id + '\'' +
                ", date='" + date + '\'' +
                ", time='" + time + '\'' +
                ", latitude='" + latitude + '\'' +
                ", longitude='" + longitude + '\'' +
                ", elevation='" + elevation + '\'' +
                ", wind_direction='" + wind_direction + '\'' +
                ", wind_direction_quality_code='" + wind_direction_quality_code + '\'' +
                ", sky_ceiling_height='" + sky_ceiling_height + '\'' +
                ", sky_ceiling_height_quality_code='" + sky_ceiling_height_quality_code + '\'' +
                ", visibility_distance='" + visibility_distance + '\'' +
                ", visibility_distance_quality_code='" + visibility_distance_quality_code + '\'' +
                ", air_temperature='" + air_temperature + '\'' +
                ", air_temperature_quality_code='" + air_temperature_quality_code + '\'' +
                ", dew_point_temperature='" + dew_point_temperature + '\'' +
                ", dew_point_temperature_quality_code='" + dew_point_temperature_quality_code + '\'' +
                ", atmospheric_pressure='" + atmospheric_pressure + '\'' +
                ", atmospheric_pressure_quality_code='" + atmospheric_pressure_quality_code + '\'' +
                '}';
    }
}
Weather Bean
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** NCDC marks a missing temperature reading as 9999. */
    private static final int MISS_CODE = 9999;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        NcdcWeather ncdcWeather = new NcdcWeather(line);
        String year = ncdcWeather.getDate().substring(0, 4);
        int temperature = 0;
        if (ncdcWeather.getAir_temperature().startsWith("+")) {
            temperature = Integer.parseInt(ncdcWeather.getAir_temperature().substring(1));
        } else {
            temperature = Integer.parseInt(ncdcWeather.getAir_temperature());
        }
        if (temperature != MISS_CODE && ncdcWeather.getAir_temperature_quality_code().matches("[01459]")) {
            context.write(new Text(year), new IntWritable(temperature));
        }
    }
}
Mapper
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable temp : values) {
            max = Math.max(max, temp.get());
        }
        context.write(key, new IntWritable(max));
    }
}
Reducer
public class MaxTemperature {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperature <input path> <output path>");
            System.exit(-1);
        }

        Job job = Job.getInstance();
        job.setJarByClass(MaxTemperature.class);
        job.setJobName("Max Temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
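As the logs below will point out ("Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this."), this driver does not implement the Tool interface. For reference, here is a minimal sketch of what a ToolRunner-based driver might look like; this is an assumption on my part, not the code actually used for the runs in this article, and the commented-out combiner line is likewise just an option the original driver does not take:

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hedged sketch only: a ToolRunner-style driver, not the code used for the runs below.
public class MaxTemperatureDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s <input path> <output path>%n", getClass().getSimpleName());
            return -1;
        }

        Job job = Job.getInstance(getConf(), "Max Temperature");
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTemperatureMapper.class);
        // Taking a maximum is associative and commutative, so the reducer could
        // in principle also serve as a combiner; the original driver does not do this.
        // job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MaxTemperatureDriver(), args);
        System.exit(exitCode);
    }
}

Run this way, the generic options such as -D, -files and -libjars are parsed before run() is invoked, which is exactly what the warning in the log is asking for.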
Data Preparation
Weather data for the two years 1901 and 1902 was loaded into HDFS ahead of time, as shown in the figure below:
Verification Run
Package the code above into a jar and push it to our Hadoop environment in the virtual machine. To guard against bugs in the program, and to save execution time, we first verify the code on a small cluster with a small amount of data. For this purpose I prepared a pseudo-distributed cluster in advance (a pseudo-distributed cluster being a fully distributed cluster with just one node). Now run the program in that environment:
yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/ /max_temperature_out/
The log output of a successful run:
-- ::, INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:
-- ::, WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
-- ::, INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoopuser/.staging/job_1568096329203_0001
-- ::, INFO input.FileInputFormat: Total input files to process :
-- ::, INFO mapreduce.JobSubmitter: number of splits:
-- ::, INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
-- ::, INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568096329203_0001
-- ::, INFO mapreduce.JobSubmitter: Executing with tokens: []
-- ::, INFO conf.Configuration: resource-types.xml not found
-- ::, INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
-- ::, INFO impl.YarnClientImpl: Submitted application application_1568096329203_0001
-- ::, INFO mapreduce.Job: The url to track the job: http://slave1:8088/proxy/application_1568096329203_0001/
-- ::, INFO mapreduce.Job: Running job: job_1568096329203_0001
-- ::, INFO mapreduce.Job: Job job_1568096329203_0001 running in uber mode : false
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: Job job_1568096329203_0001 completed successfully
-- ::, INFO mapreduce.Job: Counters:
File System Counters
FILE: Number of bytes read=
FILE: Number of bytes written=
FILE: Number of read operations=
FILE: Number of large read operations=
FILE: Number of write operations=
HDFS: Number of bytes read=
HDFS: Number of bytes written=
HDFS: Number of read operations=
HDFS: Number of large read operations=
HDFS: Number of write operations=
HDFS: Number of bytes read erasure-coded=
Job Counters
Launched map tasks=
Launched reduce tasks=
Data-local map tasks=
Total time spent by all maps in occupied slots (ms)=
Total time spent by all reduces in occupied slots (ms)=
Total time spent by all map tasks (ms)=
Total time spent by all reduce tasks (ms)=
Total vcore-milliseconds taken by all map tasks=
Total vcore-milliseconds taken by all reduce tasks=
Total megabyte-milliseconds taken by all map tasks=
Total megabyte-milliseconds taken by all reduce tasks=
Map-Reduce Framework
Map input records=
Map output records=
Map output bytes=
Map output materialized bytes=
Input split bytes=
Combine input records=
Combine output records=
Reduce input groups=
Reduce shuffle bytes=
Reduce input records=
Reduce output records=
Spilled Records=
Shuffled Maps =
Failed Shuffles=
Merged Map outputs=
GC time elapsed (ms)=
CPU time spent (ms)=
Physical memory (bytes) snapshot=
Virtual memory (bytes) snapshot=
Total committed heap usage (bytes)=
Peak Map Physical memory (bytes)=
Peak Map Virtual memory (bytes)=
Peak Reduce Physical memory (bytes)=
Peak Reduce Virtual memory (bytes)=
Shuffle Errors
BAD_ID=
CONNECTION=
IO_ERROR=
WRONG_LENGTH=
WRONG_MAP=
WRONG_REDUCE=
File Input Format Counters
Bytes Read=
File Output Format Counters
Bytes Written=
Next, let's work through the log line by line:
-- ::, INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:
# Hmm, first it has to find the ResourceManager.
-- ::, WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
# Uh-oh, the way the job is launched is dated; implementing the Tool interface and running it through ToolRunner is the recommended fix.
-- ::, INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoopuser/.staging/job_1568096329203_0001
-- ::, INFO input.FileInputFormat: Total input files to process :
# Number of input files: 2.
-- ::, INFO mapreduce.JobSubmitter: number of splits:
# Number of splits: 2 (remarkably, exactly the same as the number of input blocks~).
-- ::, INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
-- ::, INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568096329203_0001
# Submitting the job tokens. Here we finally see the job naming rule, roughly job_<timestamp>_<4-digit incrementing sequence>.
-- ::, INFO mapreduce.JobSubmitter: Executing with tokens: []
-- ::, INFO conf.Configuration: resource-types.xml not found
-- ::, INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
-- ::, INFO impl.YarnClientImpl: Submitted application application_1568096329203_0001
# It looks like the application has been submitted to YARN.
-- ::, INFO mapreduce.Job: The url to track the job: http://slave1:8088/proxy/application_1568096329203_0001/
# Hmm, a URL is provided for tracking the job's progress.
-- ::, INFO mapreduce.Job: Running job: job_1568096329203_0001
-- ::, INFO mapreduce.Job: Job job_1568096329203_0001 running in uber mode : false
# The job does not meet the conditions for an uber job, so it will run in non-uber mode.
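Whether a job runs in uber mode, that is, with all of its tasks executed inside the ApplicationMaster's own JVM, is controlled by a few configuration properties. Below is a minimal sketch of how a small job could be allowed to run as an uber task; it is a fragment meant for a driver (Configuration and Job imported from org.apache.hadoop), the property names are standard MapReduce settings, but the values shown are only illustrative and were not applied in this experiment:

// Hedged sketch: letting a small job run as an uber task inside the ApplicationMaster's JVM.
// Property names are standard MapReduce settings; the values here are only illustrative.
Configuration conf = new Configuration();
conf.setBoolean("mapreduce.job.ubertask.enable", true); // disabled by default
conf.setInt("mapreduce.job.ubertask.maxmaps", 9);       // upper bound on map tasks for an uber job
conf.setInt("mapreduce.job.ubertask.maxreduces", 1);    // the framework supports at most one reduce for uber jobs
// mapreduce.job.ubertask.maxbytes defaults to the HDFS block size
Job job = Job.getInstance(conf, "Max Temperature");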
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: Job job_1568096329203_0001 completed successfully
# The job completed successfully.
-- ::, INFO mapreduce.Job: Counters:
# What follows are the job's counters.
File System Counters
FILE: Number of bytes read=
FILE: Number of bytes written=
FILE: Number of read operations=
FILE: Number of large read operations=
FILE: Number of write operations=
HDFS: Number of bytes read=
HDFS: Number of bytes written=
HDFS: Number of read operations=
HDFS: Number of large read operations=
HDFS: Number of write operations=
HDFS: Number of bytes read erasure-coded=
Job Counters
Launched map tasks=
Launched reduce tasks=
Data-local map tasks=
Total time spent by all maps in occupied slots (ms)=
Total time spent by all reduces in occupied slots (ms)=
Total time spent by all map tasks (ms)=
Total time spent by all reduce tasks (ms)=
Total vcore-milliseconds taken by all map tasks=
Total vcore-milliseconds taken by all reduce tasks=
Total megabyte-milliseconds taken by all map tasks=
Total megabyte-milliseconds taken by all reduce tasks=
Map-Reduce Framework
Map input records=
Map output records=
Map output bytes=
Map output materialized bytes=
Input split bytes=
Combine input records=
Combine output records=
Reduce input groups=
Reduce shuffle bytes=
Reduce input records=
Reduce output records=
Spilled Records=
Shuffled Maps =
Failed Shuffles=
Merged Map outputs=
GC time elapsed (ms)=
CPU time spent (ms)=
Physical memory (bytes) snapshot=
Virtual memory (bytes) snapshot=
Total committed heap usage (bytes)=
Peak Map Physical memory (bytes)=
Peak Map Virtual memory (bytes)=
Peak Reduce Physical memory (bytes)=
Peak Reduce Virtual memory (bytes)=
Shuffle Errors
BAD_ID=
CONNECTION=
IO_ERROR=
WRONG_LENGTH=
WRONG_MAP=
WRONG_REDUCE=
File Input Format Counters
Bytes Read=
File Output Format Counters
Bytes Written=
Now let's look at what actually ended up in the output directory we specified:
As you can see, the output directory contains two files: an empty file named _SUCCESS, which marks that the job completed successfully, and a file named part-r-00000, which holds the output of the reduce task. Looking at the contents of part-r-00000:
We can see that the maximum temperatures for 1901 and 1902 come out as 317 and 244 respectively (the NCDC records store temperature in tenths of a degree Celsius, so 31.7 °C and 24.4 °C).
We could also compute the maximum temperature in a non-MapReduce way to check whether this MapReduce program computes it correctly, but since that is not our focus and the program logic is fairly simple, let's take the result on trust for now (a rough sketch of such a check is given below).
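For reference, here is a minimal single-machine sketch of that sanity check. It assumes the raw NCDC files are available uncompressed in a local directory (the path below is hypothetical) and reuses the same column offsets and quality-code filter as the mapper above:

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Stream;

// Hedged sketch: compute the maximum temperature per year locally, without MapReduce.
public class LocalMaxTemperature {
    public static void main(String[] args) throws IOException {
        Map<String, Integer> maxByYear = new HashMap<>();
        // Hypothetical local directory holding the raw (uncompressed) NCDC files.
        try (Stream<Path> files = Files.list(Paths.get("/tmp/ncdc"))) {
            files.forEach(file -> {
                try (Stream<String> lines = Files.lines(file)) {
                    lines.forEach(line -> {
                        if (line.length() < 105) {
                            return; // skip malformed records
                        }
                        String year = line.substring(15, 19);  // first 4 chars of the date field
                        String raw = line.substring(87, 92);   // signed air temperature
                        int temperature = Integer.parseInt(raw.startsWith("+") ? raw.substring(1) : raw);
                        String quality = line.substring(92, 93);
                        if (temperature != 9999 && quality.matches("[01459]")) {
                            maxByYear.merge(year, temperature, Math::max);
                        }
                    });
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
        }
        maxByYear.forEach((y, max) -> System.out.println(y + "\t" + max));
    }
}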
Hypotheses
Based on the input data and the messages logged while the program ran, we make the following hypotheses:
Hypothesis 1: the number of splits equals the number of input blocks.
Hypothesis 2: the File Input Format Counters value equals the total number of bytes in the input files.
Hypothesis 3: the File Output Format Counters value equals the total number of bytes written to the output directory (these two counters can also be read programmatically, as sketched below).
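Hypotheses 2 and 3 are about counters, and counters do not have to be read off the console: once a job has finished, they can also be fetched from the Job object. A minimal sketch, assuming this helper lives in the driver class and is called after job.waitForCompletion(true) has returned; the enums used are Hadoop's standard counter enums:

import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter;

// Hedged sketch: dump a few counters of a finished job from the driver.
public static void printSomeCounters(Job job) throws java.io.IOException {
    Counters counters = job.getCounters();
    long bytesRead = counters.findCounter(FileInputFormatCounter.BYTES_READ).getValue();
    long bytesWritten = counters.findCounter(FileOutputFormatCounter.BYTES_WRITTEN).getValue();
    long mapInputRecords = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
    System.out.println("File Input Format Counters / Bytes Read = " + bytesRead);        // hypothesis 2
    System.out.println("File Output Format Counters / Bytes Written = " + bytesWritten); // hypothesis 3
    System.out.println("Map input records = " + mapInputRecords);
}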
Verifying the Hypotheses
Run the code above in the fully distributed environment we set up, in order to verify the hypotheses. The input files prepared in that environment are shown below:
That is, the cluster holds data files for 44 years, 3137231401 bytes in total, stored in 61 HDFS blocks.
Now submit the job:
yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/raw/ /max_temperature_out/
Submitting and running the job was not all smooth sailing; quite a few problems came up along the way, the most typical being java.net.NoRouteToHostException. The process of resolving that error is recorded as Problem 1 in 《Hadoop学习问题记录之基础篇》.
After working through those problems, the log output of the successful run is as follows:
[hadoop_user@master hadoop-3.2.]$ yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/raw/ /max_out
-- ::, INFO client.RMProxy: Connecting to ResourceManager at master/192.168.212.132:
-- ::, WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
-- ::, INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop_user/.staging/job_1568605271327_0002
-- ::, INFO input.FileInputFormat: Total input files to process :
-- ::, INFO mapreduce.JobSubmitter: number of splits:
-- ::, INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
-- ::, INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568605271327_0002
-- ::, INFO mapreduce.JobSubmitter: Executing with tokens: []
-- ::, INFO conf.Configuration: resource-types.xml not found
-- ::, INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
-- ::, INFO impl.YarnClientImpl: Submitted application application_1568605271327_0002
-- ::, INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1568605271327_0002/
-- ::, INFO mapreduce.Job: Running job: job_1568605271327_0002
-- ::, INFO mapreduce.Job: Job job_1568605271327_0002 running in uber mode : false
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: map % reduce %
-- ::, INFO mapreduce.Job: Job job_1568605271327_0002 completed successfully
-- ::, INFO mapreduce.Job: Counters:
File System Counters
FILE: Number of bytes read=
FILE: Number of bytes written=
FILE: Number of read operations=
FILE: Number of large read operations=
FILE: Number of write operations=
HDFS: Number of bytes read=
HDFS: Number of bytes written=
HDFS: Number of read operations=
HDFS: Number of large read operations=
HDFS: Number of write operations=
HDFS: Number of bytes read erasure-coded=
Job Counters
Killed map tasks=
Launched map tasks=
Launched reduce tasks=
Data-local map tasks=
Rack-local map tasks=
Total time spent by all maps in occupied slots (ms)=
Total time spent by all reduces in occupied slots (ms)=
Total time spent by all map tasks (ms)=
Total time spent by all reduce tasks (ms)=
Total vcore-milliseconds taken by all map tasks=
Total vcore-milliseconds taken by all reduce tasks=
Total megabyte-milliseconds taken by all map tasks=
Total megabyte-milliseconds taken by all reduce tasks=
Map-Reduce Framework
Map input records=
Map output records=
Map output bytes=
Map output materialized bytes=
Input split bytes=
Combine input records=
Combine output records=
Reduce input groups=
Reduce shuffle bytes=
Reduce input records=
Reduce output records=
Spilled Records=
Shuffled Maps =
Failed Shuffles=
Merged Map outputs=
GC time elapsed (ms)=
CPU time spent (ms)=
Physical memory (bytes) snapshot=
Virtual memory (bytes) snapshot=
Total committed heap usage (bytes)=
Peak Map Physical memory (bytes)=
Peak Map Virtual memory (bytes)=
Peak Reduce Physical memory (bytes)=
Peak Reduce Virtual memory (bytes)=
Shuffle Errors
BAD_ID=
CONNECTION=
IO_ERROR=
WRONG_LENGTH=
WRONG_MAP=
WRONG_REDUCE=
File Input Format Counters
Bytes Read=
File Output Format Counters
Bytes Written=
From the logs we can conclude:
Hypothesis 1 is wrong: there are 58 input splits, which does not equal the 61 blocks that make up the input files.
Hypothesis 2 is wrong: File Input Format Counters does not always equal the actual number of input bytes. Here the MapReduce job reports 3137288745 bytes read, slightly more than the 3137231401 bytes reported by the fsck tool (a plausible explanation is that a map task reading line-oriented input reads past the end of its split to finish the last line, so some bytes around split boundaries are read more than once).
Hypothesis 3 is correct: File Output Format Counters equals the total number of bytes of the part-r-* files actually written.
Conclusions
- The number of splits does not necessarily equal the number of input blocks (see the sketch after this list).
- File Input Format Counters does not always equal the total input file size reported by the fsck tool.
- File Output Format Counters equals the total size of all part-r-* files actually written.
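As a brief preview of why split and block counts can differ for text input: FileInputFormat derives a target split size from the block size and the configured minimum and maximum split sizes, and allows the last split of a file to run up to 10% over that target, so a small trailing block gets folded into the previous split. Below is a rough paraphrase of that logic for a single file, a sketch rather than a verbatim copy of the Hadoop source:

// Hedged paraphrase of FileInputFormat's split sizing for one file (not a verbatim copy of Hadoop source).
static int countSplits(long fileLength, long blockSize, long minSplitSize, long maxSplitSize) {
    long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize)); // usually equals the block size
    final double SPLIT_SLOP = 1.1; // the last split may exceed splitSize by up to 10%

    int splits = 0;
    long bytesRemaining = fileLength;
    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        splits++;                  // carve off one full-size split
        bytesRemaining -= splitSize;
    }
    if (bytesRemaining != 0) {
        splits++;                  // the final split absorbs the remainder, possibly a small trailing block
    }
    return splits;
}

Under this rule a file of, say, 1.05 block sizes yields a single split even though HDFS stores it as two blocks, which is at least consistent with the 58 splits observed here against 61 blocks.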
ps1: The conclusions above were reached only for text input; whether other kinds of InputFormat would break them is left as an open question for now.
ps2: The question of how the number of splits is determined will be covered in a separate article and is not pursued further here. See here.