
Hadoop 2.x in Practice: Writing WordCount, Sort, Deduplication, and Average MapReduce Examples

Hadoop version: 2.6.0

Eclipse version: Luna

I. A WordCount example in Hadoop

1. Add the jar dependencies (the POM below pins 2.2.0; ideally match the version of your cluster, 2.6.0 here)

	<dependencies>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>2.2.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>2.2.0</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>2.2.0</version>
		</dependency>
		<dependency>
			<groupId>jdk.tools</groupId>
			<artifactId>jdk.tools</artifactId>
			<version>1.6</version>
			<scope>system</scope>
			<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
		</dependency>
	</dependencies>
2. Writing the code
package com.lin.wordcount;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
 
public class WordCount {
 
    public static class WordCountMapper extends MapReduceBase implements Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
 
        public void map(Object key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, one); // emit each token as a (word, 1) pair for the reducer
            }

        }
    }
 
    public static class WordCountReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();
 
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) { // values with the same key are sent to the same reducer, so accumulate them in a loop
                sum += values.next().get();
            }
            result.set(sum);
            output.collect(key, result); // write the result to HDFS
        }
      
    }
 
    public static void main(String[] args)throws Exception {
        //System.setProperty("hadoop.home.dir", "D:\\project\\hadoop-2.7.2"); // do this if HADOOP_HOME is not set in the local environment
    	
        String input = "hdfs://hmaster:9000/input/LICENSE.txt";
        String output = "hdfs://hmaster:9000/output/";
 
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("WordCount");
        // Option 1: set the connection parameters via the Hadoop config files
        conf.addResource("classpath:/hadoop2/core-site.xml");
        conf.addResource("classpath:/hadoop2/hdfs-site.xml");
        conf.addResource("classpath:/hadoop2/mapred-site.xml");
        conf.addResource("classpath:/hadoop2/yarn-site.xml");
        // Option 2: set the connection parameters directly
        //conf.set("mapred.job.tracker", "10.75.201.125:9000");

        conf.setOutputKeyClass(Text.class);          // output key type
        conf.setOutputValueClass(IntWritable.class); // output value type

        conf.setMapperClass(WordCountMapper.class);    // map operator
        conf.setCombinerClass(WordCountReducer.class); // combine operator
        conf.setReducerClass(WordCountReducer.class);  // reduce operator

        conf.setInputFormat(TextInputFormat.class);   // input format
        conf.setOutputFormat(TextOutputFormat.class); // output format

        FileInputFormat.setInputPaths(conf, new Path(input));   // input path
        FileOutputFormat.setOutputPath(conf, new Path(output)); // output path

        JobClient.runJob(conf);
        System.exit(0);
    }
 
}
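
The listing above uses the older org.apache.hadoop.mapred API, while the remaining examples in this post use the newer org.apache.hadoop.mapreduce API. For reference, a minimal sketch of the same mapper and reducer written against the new API might look like the following; this is an illustration only (the class names are mine), not part of the original project:

// Minimal new-API (org.apache.hadoop.mapreduce) equivalent of the mapper/reducer above.
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class NewApiWordCount {

    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one); // emit (word, 1)
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get(); // accumulate counts for the same word
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}

The driver for this variant would use Job.getInstance instead of JobConf, exactly as the Sort, deduplication and average examples below do.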


3. Output

Final output:
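
The original post shows the console output and the HDFS result as screenshots, which are not reproduced here. One way to inspect the job output programmatically is to read the part files back from HDFS; the following is a minimal sketch, assuming the NameNode address and output directory used in the job above:

// Read the WordCount output back from HDFS and print it (illustrative sketch).
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrintOutput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumes the same NameNode address as the job above.
        FileSystem fs = FileSystem.get(new java.net.URI("hdfs://hmaster:9000"), conf);
        for (FileStatus status : fs.listStatus(new Path("/output"))) {
            if (!status.getPath().getName().startsWith("part-")) {
                continue; // skip _SUCCESS and other non-data files
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath())));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // each line is "word<TAB>count"
            }
            reader.close();
        }
        fs.close();
    }
}

On the cluster itself, hdfs dfs -cat /output/part-00000 shows the same content.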


II. Sort example

Source code:

package com.lin.sort;

/**
 * Overview: data sorting
 * 
 * @author linbingwen
 * @since  June 30, 2016
 */

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
 
public class Sort {
 
    // The map converts each input value into an IntWritable and emits it as the output key
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> {
        private static IntWritable data = new IntWritable();

        // map implementation
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            data.set(Integer.parseInt(line));
            context.write(data, new IntWritable(1));
        }
       
    }
   
    // The reduce copies the input key to the output key,
    // emits it once for each element in the value list,
    // and uses the global linenum counter as the rank
    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        private static IntWritable linenum = new IntWritable(1);

        // reduce implementation
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            for(IntWritable val:values){
                context.write(linenum, key);
                linenum = new IntWritable(linenum.get()+1);
            }
           
        }
 
    }
   
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        // This line matters if you point at the cluster directly (here the XML resources below are used instead):
        // conf.set("mapred.job.tracker", "192.168.1.2:9001");
        conf.addResource("classpath:/hadoop2/core-site.xml");
        conf.addResource("classpath:/hadoop2/hdfs-site.xml");
        conf.addResource("classpath:/hadoop2/mapred-site.xml");
        conf.addResource("classpath:/hadoop2/yarn-site.xml");

        String[] ioArgs = new String[] { "hdfs://hmaster:9000/sort_in", "hdfs://hmaster:9000/sort_out" };
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Data Sort <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "Data Sort");
        job.setJarByClass(Sort.class);

        // Set the Map and Reduce classes
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Set the output types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input and output directories
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Input files:

file1.txt

2
32
654
32
15
756
65223

file2.txt

5956
22
650
92

file3.txt

26
54
6
Run result:

Console input/output:

Below is the result as viewed on the machine where Hadoop is installed:
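
The screenshots from the original post are not reproduced here. Since the job runs with a single reducer (the default) and MapReduce sorts keys before the reduce phase, the reducer simply numbers the incoming values in ascending order. Worked through by hand from the three input files above (a hand-derived expectation, not captured from an actual run), the output should be:

1	2
2	6
3	15
4	22
5	26
6	32
7	32
8	54
9	92
10	650
11	654
12	756
13	5956
14	65223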


III. Deduplication example

package com.lin.diffdata;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


/**
 * Overview: data deduplication
 * 
 * @author linbingwen
 * @since June 28, 2016
 */
public class DiffData {

	// The map copies each input value to the output key and emits it directly
	public static class Map extends Mapper<Object, Text, Text, Text> {
		private static Text line = new Text(); // one line of input

		// map implementation
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			line = value;
			context.write(line, new Text(""));
		}

	}

	// The reduce copies the input key to the output key and emits it directly
	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		// reduce implementation
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			context.write(key, new Text(""));
		}

	}

	public static void main(String[] args) throws Exception {
		
		JobConf conf = new JobConf(DiffData.class);
		conf.addResource("classpath:/hadoop2/core-site.xml");
		conf.addResource("classpath:/hadoop2/hdfs-site.xml");
		conf.addResource("classpath:/hadoop2/mapred-site.xml");
		conf.addResource("classpath:/hadoop2/yarn-site.xml");

		String[] ioArgs = new String[] { "hdfs://hmaster:9000/input", "hdfs://hmaster:9000/output" };
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: Data Deduplication <in> <out>");
			System.exit(2);
		}

		Job job =  Job.getInstance(conf, "Data Deduplication");
		job.setJarByClass(DiffData.class);

		// Set the Map, Combine and Reduce classes
		job.setMapperClass(Map.class);
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);

		// Set the output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Set the input and output directories
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
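
A small refinement worth noting: since the value carries no information here, NullWritable can be used instead of an empty Text, which avoids serializing empty strings. A minimal sketch of the mapper and reducer with that change (illustrative, not part of the original post):

// Deduplication with NullWritable values instead of empty Text (sketch).
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class DiffDataNullValue {

    public static class Map extends Mapper<Object, Text, Text, NullWritable> {
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emit the whole line as the key; the value carries no information.
            context.write(value, NullWritable.get());
        }
    }

    public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Identical lines are grouped under one key, so writing the key once removes duplicates.
            context.write(key, NullWritable.get());
        }
    }
}

The driver for this variant would additionally call job.setOutputValueClass(NullWritable.class).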

Run output:

Final result:

The input files:

file1.txt

2012-3-1 a
2012-3-2 b
2012-3-3 c
2012-3-4 d
2012-3-5 a
2012-3-6 b
2012-3-7 c
2012-3-3 c

file2.txt

2012-3-1 b
2012-3-2 a
2012-3-3 b
2012-3-4 d
2012-3-5 a
2012-3-6 c
2012-3-7 d
2012-3-3 c

Result as viewed on the machine where Hadoop is installed:
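
The screenshot is again not reproduced. Worked through by hand from file1.txt and file2.txt above (a hand-derived expectation, not captured from an actual run), the deduplicated output, with keys sorted by the framework, should contain one line per distinct record:

2012-3-1 a
2012-3-1 b
2012-3-2 a
2012-3-2 b
2012-3-3 b
2012-3-3 c
2012-3-4 d
2012-3-5 a
2012-3-6 b
2012-3-6 c
2012-3-7 c
2012-3-7 d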


IV. Computing the average score

package com.lin.average;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Average {

	public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
		// map implementation
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Convert the plain-text input to a String
			String line = value.toString();
			// Split the input into lines first
			StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");
			// Process each line
			while (tokenizerArticle.hasMoreElements()) {
				// Split each line on whitespace
				StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());
				String strName = tokenizerLine.nextToken(); // student name
				String strScore = tokenizerLine.nextToken(); // score
				Text name = new Text(strName);
				int scoreInt = Integer.parseInt(strScore);
				// Emit the name and the score
				context.write(name, new IntWritable(scoreInt));
			}
		}

	}

	public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
		// reduce implementation
		public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
			int sum = 0;
			int count = 0;
			Iterator<IntWritable> iterator = values.iterator();
			while (iterator.hasNext()) {
				sum += iterator.next().get(); // total score
				count++; // number of scores
			}
			int average = sum / count; // integer average
			context.write(key, new IntWritable(average));
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Set the Hadoop master address and port
		conf.set("mapred.job.tracker", "10.75.201.125:9000");
		// Set the input and output directories
		String[] ioArgs = new String[] { "hdfs://hmaster:9000/average_in", "hdfs://hmaster:9000/average_out" };
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: Score Average <in> <out>");
			System.exit(2);
		}
		// Create the job
		Job job = Job.getInstance(conf, "Score Average");
		
		// Delete the output directory if it already exists
//        FileSystem fs = FileSystem.get(conf);
//        Path out = new Path(otherArgs[1]);
//        if (fs.exists(out)){
//            fs.delete(out, true);
//        }
		
		job.setJarByClass(Average.class);
		
		// Set the Map and Reduce classes.
		// Note: Reduce is NOT registered as a combiner here, because averaging
		// partial averages is not the same as averaging all scores.
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);
		
		// Set the output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// Split the input data set into splits and provide a RecordReader implementation
		job.setInputFormatClass(TextInputFormat.class);
		
		// Provide a RecordWriter implementation responsible for writing the output
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// Set the input and output directories
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

}
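
The original listing also registered Reduce as the combiner. That only happens to give the right answer for this particular data set, where each mapper sees exactly one score per student; in general, averaging partial averages is wrong, which is why the combiner line is dropped above. If map-side combining is wanted, the combiner has to forward (sum, count) pairs and only the final reduce divides. A minimal sketch of that variant (class names are mine, not from the original post):

// Combiner-safe average: the combiner emits "sum,count" pairs; only the reducer divides.
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class AverageWithCombiner {

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Assumes each line is "<name> <score>".
            StringTokenizer tokens = new StringTokenizer(value.toString());
            String name = tokens.nextToken();
            String score = tokens.nextToken();
            // Each record starts as a (score, 1) pair encoded as "sum,count".
            context.write(new Text(name), new Text(score + ",1"));
        }
    }

    // Sums partial (sum, count) pairs; safe to run zero, one or many times.
    public static class Combine extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            long count = 0;
            for (Text v : values) {
                String[] parts = v.toString().split(",");
                sum += Long.parseLong(parts[0]);
                count += Long.parseLong(parts[1]);
            }
            context.write(key, new Text(sum + "," + count));
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            long count = 0;
            for (Text v : values) {
                String[] parts = v.toString().split(",");
                sum += Long.parseLong(parts[0]);
                count += Long.parseLong(parts[1]);
            }
            context.write(key, new IntWritable((int) (sum / count))); // divide only once, at the end
        }
    }
}

The driver for this variant would register Combine as the combiner and call job.setMapOutputValueClass(Text.class), while the final output value class stays IntWritable.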

Run result:



Input file1.txt:

張三    88
李四    99
王五    66
趙六    77

file2.txt

張三    78
李四    89
王五    96
趙六    67

file3.txt

張三    80
李四    82
王五    84
趙六    86
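
Worked through by hand from the three files, using integer division as in the reducer above (a hand-derived expectation, not captured from an actual run):

張三	(88 + 78 + 80) / 3 = 82
李四	(99 + 89 + 82) / 3 = 90
王五	(66 + 96 + 84) / 3 = 82
趙六	(77 + 67 + 86) / 3 = 76  (230 / 3 truncated)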