
A Detailed Walkthrough of the WordCount Code


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.examples;
//import the required packages
import java.io.IOException;        //IOException, thrown when an I/O error occurs
import java.util.StringTokenizer;  //StringTokenizer, which splits a string into tokens using whitespace as the default delimiter

import org.apache.hadoop.conf.Configuration;//class for reading Hadoop configuration information
import org.apache.hadoop.fs.Path;           //class representing file system paths for job input and output
import org.apache.hadoop.io.IntWritable;    //Writable wrapper type for an int
import org.apache.hadoop.io.Text;           //Writable wrapper type for a UTF-8 string
import org.apache.hadoop.mapreduce.Job;     //class that describes and submits a MapReduce job
import org.apache.hadoop.mapreduce.Mapper;  //base class for the map phase
import org.apache.hadoop.mapreduce.Reducer; //base class for the reduce phase
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   //class used to set the job's input paths
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; //class used to set the job's output path
import org.apache.hadoop.util.GenericOptionsParser;             //GenericOptionsParser, which parses common Hadoop command-line options and sets the corresponding values on a Configuration object
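// A quick illustration of how StringTokenizer behaves (a sketch, not part of
// the original source): with its default delimiters it splits on spaces,
// tabs and newlines and skips empty tokens.
//
//   StringTokenizer itr = new StringTokenizer("hello world  hello");
//   while (itr.hasMoreTokens()) {
//     System.out.println(itr.nextToken()); // prints hello, world, hello
//   }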

public class WordCount {

  public static class TokenizerMapper 
       extends Mapper<Object, Text, Text, IntWritable>{         // our TokenizerMapper class, which extends the Mapper class imported above
    
    private final static IntWritable one = new IntWritable(1);  // an IntWritable object initialized to the constant 1
    private Text word = new Text();                             // a Text object that will hold the current word
      
    public void map(Object key, Text value, Context context     // the map method
                    ) throws IOException, InterruptedException {

// A note on the Context class: it is an inner class of Mapper that lets user
// code communicate with the MapReduce framework, e.g. to hand the map output
// on to the reduce phase. The framework uses it to track the state of the
// task; MapContext records the context in which the map runs and gives access
// to job configuration. In short, context acts as the bridge between the map
// and reduce stages: whatever the map function writes to it is what the
// framework later delivers to the reducers.

      StringTokenizer itr = new StringTokenizer(value.toString());// a StringTokenizer over the input line, splitting on whitespace
      while (itr.hasMoreTokens()) {// while there are more tokens in the line
        word.set(itr.nextToken()); // set word to the next token
        context.write(word, one);  // context.write stores the pair (word, 1) in the context
      }
    }
  }
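  // To make the map-side data flow concrete (an illustration, not part of the
  // original source): given the input line "hello world hello", the map
  // method above emits the pairs
  //   ("hello", 1), ("world", 1), ("hello", 1)
  // and the framework then groups them by key, so the reducer for "hello"
  // later receives the value list [1, 1].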
  
  public static class IntSumReducer                           // our IntSumReducer class, which extends the Reducer class imported above                             
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();           // an IntWritable object that will hold the summed count

    public void reduce(Text key, Iterable<IntWritable> values,Context context// the reduce method; values is an Iterable over the counts (IntWritable) for one key, following the iterator design pattern: an object that traverses a sequence without the developer needing to know its underlying structure
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();// add up the occurrences of this word
      }
      result.set(sum);// store the sum in result
      context.write(key, result);// write the final (word, count) pair
    }
  }
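  // Continuing the same illustration (not part of the original source):
  // reduce is called once per distinct key, e.g.
  //   reduce("hello", [1, 1]) -> writes ("hello", 2)
  //   reduce("world", [1])    -> writes ("world", 1)
  // Because IntSumReducer is also registered as the Combiner below, the same
  // summing logic may already run on each map node, shrinking the data that
  // has to be shuffled across the network.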

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
  // Every MapReduce program begins by creating a Configuration. This class
  // reads the system configuration set up when Hadoop was installed, i.e. the
  // files core-site.xml, hdfs-site.xml, mapred-site.xml and so on, covering
  // both HDFS and MapReduce. Why is this needed? When we develop a MapReduce
  // job we are really just filling in the blanks: the business logic goes into
  // the map and reduce functions, and the framework handles everything else.
  // But the framework still has to be told how to operate, for instance where
  // HDFS lives and where the MapReduce JobTracker runs, and that information
  // sits in the configuration files that Configuration loads.
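    // For example (an illustrative lookup, not in the original code): once the
    // configuration files have been loaded, values can be read from conf, such
    // as the URI of the default file system:
    //   String fsUri = conf.get("fs.defaultFS");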

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.err.println("Usage: wordcount <in> [<in>...] <out>");
      System.exit(2);
    }
    // The if block is easy to understand: the program must be started with at
    // least two remaining arguments (one or more input paths plus one output
    // path), otherwise it prints the usage message and exits. As for the
    // GenericOptionsParser class on the line above, it parses the common
    // Hadoop command-line options and sets the corresponding values on the
    // Configuration object as needed.
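    // An illustration of what GenericOptionsParser does (the jar name here is
    // a placeholder, not from the original source): if the program is started
    // as
    //   hadoop jar wc.jar WordCount -D mapreduce.job.reduces=2 in out
    // the parser consumes the generic option -D, sets mapreduce.job.reduces=2
    // on conf, and returns otherArgs as ["in", "out"].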
    Job job = Job.getInstance(conf, "word count");// use Job.getInstance to create a job named "word count"
    job.setJarByClass(WordCount.class);           // tell Hadoop which jar to distribute by locating the jar that contains WordCount.class
    job.setMapperClass(TokenizerMapper.class);    // set the Mapper class (used in the map phase)
    job.setCombinerClass(IntSumReducer.class);    // set the Combiner class (merges intermediate results locally)
    job.setReducerClass(IntSumReducer.class);     // set the Reducer class (used in the reduce phase)
    job.setOutputKeyClass(Text.class);            // set the key class for the job's output: the reduce output key type is Text
    job.setOutputValueClass(IntWritable.class);   // set the reduce output value type to IntWritable
    
    for (int i = 0; i < otherArgs.length - 1; ++i) { // add every argument except the last as an input path      
      FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job,              // the last argument is the output directory
      new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);// wait for the job to finish, then exit with 0 on success or 1 on failure
  }
}
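To run the program end to end, a typical invocation looks like the sketch
below; the jar name and the HDFS paths are placeholders for your own
environment, and note that the output directory must not exist beforehand,
since FileOutputFormat refuses to overwrite an existing path:

hadoop jar wordcount.jar org.apache.hadoop.examples.WordCount /user/hadoop/input /user/hadoop/output
hdfs dfs -cat /user/hadoop/output/part-r-00000

The second command prints the result file written by the (default single)
reducer, where each line is a word followed by its count.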
