wordcount代碼實現詳解
阿新 • 發佈:2017-07-13
標籤:常量、內部類、Context對象、系統配置、IOException
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.examples;
//導入必要的package
import java.io.IOException; //報錯類
import java.util.StringTokenizer; //StringTokenizer類,用於將空白字符作為分割符的類
import org.apache.hadoop.conf.Configuration;//Hadoop中用於讀取配置信息的類
import org.apache.hadoop.fs.Path; //有關文件系統輸入輸出數據的類
import org.apache.hadoop.io.IntWritable; //封裝定義了IntWritable類
import org.apache.hadoop.io.Text; //封裝定義了Text類
import org.apache.hadoop.mapreduce.Job; //封裝定義了Job類
import org.apache.hadoop.mapreduce.Mapper; //封裝定義了Mapper類
import org.apache.hadoop.mapreduce.Reducer; //封裝定義了Reducer類
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; //文件輸入要用到的類
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; //文件輸出要用到的類
import org.apache.hadoop.util.GenericOptionsParser; //GenericOptionsParser類,用來解釋常用hadoop命令,並根據需要為Configuration對象設置相應的值
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{ //自定義的TokenizerMapper類,繼承自前面導入的Mapper類
private final static IntWritable one = new IntWritable(1); //實例化了一個IntWritable類的one對象並賦值為常量1
private Text word = new Text(); //實例化了一個Text類的對象word
public void map(Object key, Text value, Context context //定義Map方法
) throws IOException, InterruptedException {
//這裏說一下context類,它是Mapper的一個內部類,它用來與MapReduce系統進行通信,如把map的結果傳給reduce處理。簡單的說頂級接口用它在map或是reduce任務中跟蹤task的狀態,MapContext就是記錄了map執行的上下文,在mapper類中,這個context可以存儲一些job conf的信息,同時context作為了map和reduce執行中各個函數的一個橋梁,我們可以在map函數中處理這個信息
StringTokenizer itr = new StringTokenizer(value.toString());//實例化了一個以空白字符為分隔符的StringTokenizer類的對象itr
while (itr.hasMoreTokens()) {//如果判斷還有下一個分隔符(空格)
word.set(itr.nextToken()); //則輸出並返回之間的字符串給word
context.write(word, one); //context.write方法將(word,1)這樣的二元組存入context中
}
}
}
public static class IntSumReducer //自定義的IntSumReducer類,繼承自前面導入的Reducer類
extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable(); //實例化了一個IntWritable類的result對象
public void reduce(Text key, Iterable<IntWritable> values,Context context//定義Reduce方法,這裏叠代器(Iterator)是一種設計模式,它是一個對象,它可以遍歷並選擇序列(IntWritable)中的對象,而開發人員不需要了解該序列的底層結構。
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();//將該詞的出現次數相加
}
result.set(sum);//將sum賦給result
context.write(key, result);//輸出最終結果
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//運行MapReduce程序前都要初始化Configuration,該類主要是讀取MapReduce系統配置信息,這些信息包括hdfs還有MapReduce,也就是安裝hadoop時候的配置文件例如:core-site.xml、hdfs-site.xml和mapred-site.xml等等文件裏的信息,有些童鞋不理解為啥要這麽做,這個是沒有深入思考MapReduce計算框架造成,我們程序員開發MapReduce時候只是在填空,在map函數和reduce函數裏編寫實際進行的業務邏輯,其它的工作都是交給MapReduce框架自己操作的,但是至少我們要告訴它怎麽操作啊,比如hdfs在哪裏,MapReduce的jobstracker在哪裏,而這些信息就在conf包下的配置文件裏。
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}//If的語句好理解,就是運行WordCount程序時候一定是兩個參數,如果不是就會報錯退出。至於第一句裏的GenericOptionsParser類,它是用來解釋常用hadoop命令,並根據需要為Configuration對象設置相應的值
Job job = Job.getInstance(conf, "word count");//用Job.getInstance方法設置作業名為word count
job.setJarByClass(WordCount.class); //為job的輸出數據設置Key類
job.setMapperClass(TokenizerMapper.class); //設置Mapper類(Map階段使用)
job.setCombinerClass(IntSumReducer.class); //設置Combiner類(中間合並結果)
job.setReducerClass(IntSumReducer.class); //設置Reducer類(Reduce階段使用)
job.setOutputKeyClass(Text.class); //為job的輸出數據設置Key類,規定Reduce輸出的Key類型為Text
job.setOutputValueClass(IntWritable.class); //設置Reduce輸出的Value類型為IntWritable
for (int i = 0; i < otherArgs.length - 1; ++i) { //設置輸入輸出路徑
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);//等待任務執行完畢退出
}
}
wordcount代碼實現詳解