
Explaining key MapReduce concepts through a worked example -------- in-memory sorting with a custom data type

The custom data type WCData

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/**
 * Custom data type holding a word and its count.
 * @author lyd
 */
public class WCData implements WritableComparable<WCData>{

	public String word;
	public int counter;
	
	// No-arg constructor required by Hadoop's Writable deserialization
	public WCData(){
		
	}
	
	public WCData(String word, int counter) {
		this.word = word;
		this.counter = counter;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		// Serialization: write the fields in a fixed order
		out.writeUTF(word);
		out.writeInt(counter);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		// Deserialization: read the fields back in the same order they were written
		this.word = in.readUTF();
		this.counter = in.readInt();
	}

	@Override
	public int compareTo(WCData o) {
		// Sort by counter in descending order; swap the operands for ascending order
		int tmp = o.counter - this.counter;
		if(tmp != 0){
			return tmp;
		}
		// Tie-break on the word itself, so that distinct words with the same count
		// are not treated as duplicates (and silently dropped) by TreeSet
		return this.word.compareTo(o.word);
	}
	
	
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + counter;
		result = prime * result + ((word == null) ? 0 : word.hashCode());
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		WCData other = (WCData) obj;
		if (counter != other.counter)
			return false;
		if (word == null) {
			if (other.word != null)
				return false;
		} else if (!word.equals(other.word))
			return false;
		return true;
	}

	/**
	 * @return the word
	 */
	public String getWord() {
		return word;
	}

	/**
	 * @param word the word to set
	 */
	public void setWord(String word) {
		this.word = word;
	}

	/**
	 * @return the counter
	 */
	public int getCounter() {
		return counter;
	}

	/**
	 * @param counter the counter to set
	 */
	public void setCounter(int counter) {
		this.counter = counter;
	}

	/* (non-Javadoc)
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		return word + ":" + counter;
	}

	
}
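Before wiring WCData into a job, its ordering can be sanity-checked on its own. The following is a minimal standalone sketch, not part of the original listing: the class name WCDataSortTest and the sample values are illustrative, and WCData is assumed to sit in the same package. It drops a few records into a TreeSet and prints them back in descending order of count:

import java.util.TreeSet;

public class WCDataSortTest {
	public static void main(String[] args) {
		// TreeSet orders its elements with WCData.compareTo:
		// larger counters first, ties broken by the word itself
		TreeSet<WCData> ts = new TreeSet<WCData>();
		ts.add(new WCData("qianfeng", 4));
		ts.add(new WCData("hello", 2));
		ts.add(new WCData("is", 3));
		ts.add(new WCData("hadoop", 1));
		for (WCData d : ts) {
			System.out.println(d); // qianfeng:4, is:3, hello:2, hadoop:1
		}
	}
}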

The MapReduce class

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * TOP-N: return the top N records by count.
 *
 * Sample input:
 *   hello qianfeng hello qianfeng qianfeng is best qianfeng better
 *   hadoop is good
 *   spark is nice
 *
 * Expected top three after counting:
 *   qianfeng 4
 *   is 3
 *   hello 2
 *
 * @author lyd
 */
public class TopN implements Tool {

	/**
	 * Custom Mapper: splits each line on spaces and emits (word, "1").
	 * @author lyd
	 */
	static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{

		@Override
		protected void setup(Context context)throws IOException, InterruptedException {
		}

		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {
			// Tokenize the line and emit a count of 1 for every word
			String line = value.toString();
			String[] words = line.split(" ");
			for (String s : words) {
				context.write(new Text(s), new Text("1"));
			}
		}

		@Override
		protected void cleanup(Context context)throws IOException, InterruptedException {
		}
		
	}
	
	/**
	 * Custom Reducer: sums the counts per word and keeps only the top k
	 * records in an in-memory TreeSet.
	 * @author lyd
	 */
	static class MyReducer extends Reducer<Text, Text, WCData, NullWritable>{

		// TreeSet that keeps the aggregated records ordered by WCData.compareTo
		// (descending by count, ties broken by word)
		TreeSet<WCData> ts = new TreeSet<WCData>();
		// Number of top records to keep
		public static final int k = 5;

		@Override
		protected void setup(Context context)throws IOException, InterruptedException {
		
		}
		@Override
		protected void reduce(Text key, Iterable<Text> value,Context context)
				throws IOException, InterruptedException {
			int counter = 0;
			for (Text t : value) {
				counter += Integer.parseInt(t.toString());
			}
			
			// Build the record for this word
			WCData wc = new WCData(key.toString(), counter);
			// Add it to the TreeSet; once more than k entries are held,
			// evict the last (smallest-count) element so only the top k survive
			ts.add(wc);
			if(ts.size() > k){
				ts.remove(ts.last());
			}
		}
		
		@Override
		protected void cleanup(Context context)throws IOException, InterruptedException {
			// Emit the surviving top-k records once all keys have been reduced
			for (WCData wcData : ts) {
				context.write(wcData, NullWritable.get());
			}
		}
	}
	
	
	private Configuration conf;

	@Override
	public void setConf(Configuration conf) {
		// Point the job at the target HDFS cluster and keep the configuration,
		// so that run() sees the same instance ToolRunner handed in
		conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
		this.conf = conf;
	}

	@Override
	public Configuration getConf() {
		return conf;
	}
	
	/**
	 * Driver method: configures and submits the job.
	 */
	@Override
	public int run(String[] args) throws Exception {
		// 1. Get the Configuration object
		Configuration conf = getConf();
		// 2. Create the job
		Job job = Job.getInstance(conf, "model01");
		// 3. Set the class whose jar will be used to run the job
		job.setJarByClass(TopN.class);
		// 4. Configure the map side
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		
		// 5. Configure the reduce side
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(WCData.class);
		job.setOutputValueClass(NullWritable.class);
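		// A safeguard one might add here (not in the original listing): the TreeSet-based
		// Top-N lives inside a single reducer instance, so the output is a global Top-N
		// only when the job runs one reduce task. One reduce task is the MapReduce
		// default, but it can be made explicit:
		job.setNumReduceTasks(1);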
		// Delete the output directory if it already exists
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(new Path(args[1]))){
			fs.delete(new Path(args[1]), true);
		}
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		// 6. Submit the job and wait for completion
		int isok = job.waitForCompletion(true) ? 0 : 1;
		return isok;
	}
	
	/**
	 * Main entry point of the job.
	 * @param args
	 */
	public static void main(String[] args) {
		try {
			// Parse generic Hadoop options out of the command-line arguments
			String [] argss = new GenericOptionsParser(new Configuration(), args).getRemainingArgs();
			System.exit(ToolRunner.run(new TopN(), argss));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}