
Big Data (Hadoop MapReduce Case Walkthrough)

package com.vip;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class MapReduceCaseAvg extends Configured implements Tool{

	public static class AvgMapper extends Mapper<Object, Text, Text, IntWritable>{
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			//get the line content
			String content = value.toString() ;
			//split the line into tokens
			StringTokenizer st = new StringTokenizer(content) ;
			while(st.hasMoreTokens()){
				String strName = st.nextToken() ;	//student name
				String strScore = st.nextToken() ;	//student score
				//emit <name, score>
				context.write(new Text(strName),  new IntWritable(Integer.parseInt(strScore)));
			}
		}
	}
	
	public static class AvgReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		//reducer input example: <ZhangSan, {98, 89, 79}>
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			//average = sum of all scores divided by the number of subjects
			int sum = 0 ;	//total score
			int num = 0 ;	//number of subjects
			for (IntWritable score : values) {
				sum += score.get() ;	//accumulate each subject's score
				num ++ ;
			}
			context.write(key,  new IntWritable(sum / num));
		}
	}
	
	
	@Override
	public int run(String[] args) throws Exception {
		//create the job and pass in the configuration
		Job job = Job.getInstance(getConf(), "avg mr") ;
		job.setJarByClass(MapReduceCaseAvg.class);
		
		/*set the mapper and reducer classes*/
		job.setMapperClass(AvgMapper.class);
		job.setReducerClass(AvgReducer.class);
		
		/*set the output key and value types*/
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		/*set the input and output paths*/
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		/*submit the job to the cluster and wait for it to finish*/
		boolean isSuccess = job.waitForCompletion(true);
		
		return isSuccess ? 0 : 1 ;
	}
	
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new MapReduceCaseAvg(), args) ;
		System.exit(res);
	}
}
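
A quick way to sanity-check the averaging job: the mapper reads whitespace-separated name/score pairs and the reducer writes the integer average per name. The file names, paths and jar name below are only illustrative, not part of the original code:

// hypothetical input (scores.txt, assumed format: name score)
// zhangsan 98
// zhangsan 89
// zhangsan 79
// lisi 85
// lisi 95
//
// expected output (integer average per name, key and value tab-separated)
// lisi	90
// zhangsan	88
//
// run it, assuming the classes are packaged as mr-cases.jar:
// hadoop jar mr-cases.jar com.vip.MapReduceCaseAvg /input/scores.txt /output/avg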

 

package com.vip;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapReduceCaseFilte extends Configured implements Tool {
	
	public static class FilterMapper extends Mapper<Object, Text, NullWritable, Text>{
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			//split the record on spaces
			String[] strSrc = value.toString().split(" ");
			//keep fields 0, 1, 2 and 6, joined with spaces
			String strDst = strSrc[0] + " " + strSrc[1] + " " + strSrc[2] + " " + strSrc[6] ;
			context.write(NullWritable.get(), new Text(strDst));
		}
	}
	
	
	@Override
	public int run(String[] args) throws Exception {
		Job job = Job.getInstance(getConf(), "mrfilter") ;
		job.setJarByClass(MapReduceCaseFilte.class);
		
		/*set the mapper class (map-only job, no reducer)*/
		job.setMapperClass(FilterMapper.class);
		
		/*set the output key and value types*/
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);
		
		/*set the input and output paths*/
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		/*submit the job to the cluster and wait for it to finish*/
		boolean isSuccess = job.waitForCompletion(true);
		
		return isSuccess ? 0 : 1 ;
	}

	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new MapReduceCaseFilte(), args) ;
		System.exit(res);
	}
}
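
Since no reducer class is set, Hadoop's default identity Reducer simply passes the mapper output through. The original does not show the input layout, so the record below is only an assumed example of a space-separated log line; the mapper keeps fields 0, 1, 2 and 6:

// hypothetical input record (field layout assumed for illustration):
// 192.168.1.10 user1 2021-05-01 GET /index.html 200 1024
//
// output line (fields 0, 1, 2 and 6 only):
// 192.168.1.10 user1 2021-05-01 1024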

// Anagram grouping (used by the next example): sorting the letters of a word
// gives its key, e.g. cat -> act, tar -> art.
// The reducer then receives groups such as <act, {cat, tac, cta}>.





 

package com.vip;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class MapReduceCaseWords extends Configured implements Tool{

	
	@Override
	public int run(String[] args) throws Exception {
		Configuration conf = getConf() ;
		//delete the output directory if it already exists
		Path mypath = new Path(args[1]) ;
		FileSystem hdfs = mypath.getFileSystem(conf);
		if(hdfs.exists(mypath)){
			hdfs.delete(mypath, true) ;
		}
		
		//set up the job
		Job job = Job.getInstance(conf, "words mr") ;
		job.setJarByClass(MapReduceCaseWords.class);
		
		/*set the mapper and reducer classes*/
		job.setMapperClass(WordsMapper.class);
		
		job.setReducerClass(WordsReducer.class);
		
		/*set the output key and value types*/
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		/*set the input and output paths*/
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		/*submit the job to the cluster and wait for it to finish*/
		boolean isSuccess = job.waitForCompletion(true);
		
		return isSuccess ? 0 : 1 ;
	}
	
	public static void main(String[] args) throws Exception {
		String[] args0 = {"hdfs://192.168.153.111:9000/input5",
				"hdfs://192.168.153.111:9000/output12"} ;
		int res = ToolRunner.run(new MapReduceCaseWords(), args0) ;
		System.exit(res);
	}

}
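
Unlike the first two drivers, this one hardcodes the HDFS input and output paths in main. If you would rather pass them on the command line like the other cases, a minimal alternative main could look like this (a sketch; the jar name in the comment is made up):

	public static void main(String[] args) throws Exception {
		//e.g. hadoop jar mr-cases.jar com.vip.MapReduceCaseWords /input5 /output12
		int res = ToolRunner.run(new MapReduceCaseWords(), args) ;
		System.exit(res);
	}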

 

package com.vip;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordsMapper extends Mapper<Object, Text, Text, Text>{
	private Text keyText = new Text() ;
	private Text valueText = new Text() ;
	
	@Override
	protected void map(Object key, Text value, Context context)
			throws IOException, InterruptedException {
		String word = value.toString() ;
		char[] wordChars = word.toCharArray();	//convert the word to a char array
		Arrays.sort(wordChars); 				//sort the characters
		String sword = new String(wordChars) ;	//turn the sorted chars back into a string
		keyText.set(sword);              		//set the output key (the sorted letters)
		valueText.set(word);  					//set the output value (the original word)
		context.write(keyText, valueText);		//emit <sorted letters, word>
	}
}
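
To make the sorting step concrete, this is what the mapper emits for a few made-up input words:

// "cat" -> sorted chars "act" -> emits <act, cat>
// "tac" -> sorted chars "act" -> emits <act, tac>
// "dog" -> sorted chars "dgo" -> emits <dgo, dog>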

 

package com.vip;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordsReducer extends Reducer<Text, Text, Text, Text>{
	private Text outputKey = new Text() ;	//output key
	private Text outputValue = new Text() ;	//output value
	
	@Override
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		String output = "" ;
		//join all words made of the same letters, using ~ as a separator
		for (Text word : values) {
			if(!output.equals("")){
				output = output + "~" ;
			}
			output = output + word.toString() ;
		}
		//only output groups that contain two or more words
		StringTokenizer outputTokenize = new StringTokenizer(output, "~") ;
		if(outputTokenize.countTokens() >= 2){
			output = output.replaceAll("~", ",") ;
			outputKey.set(key.toString()); 			//set the key
			outputValue.set(output);				//set the value
			context.write(outputKey, outputValue);	//emit the result
		}
	}
	
	
}
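
Putting the mapper and reducer together, each group of anagrams ends up on one output line, while single-word groups are dropped. For the example words above (the order of values inside a group is not guaranteed by MapReduce):

// reducer input <act, {cat, tac, cta}>  ->  output line: act	cat,tac,cta
// reducer input <dgo, {dog}>            ->  no output (only one word in the group)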