Hadoop: Custom Data Types Implementing WritableComparable, with Partitioning and Sorting

http://blog.csdn.net/u014432433/article/details/51104026

1. In MapReduce programming, keys are used for partitioning and sorting. When Hadoop's built-in key types cannot satisfy a requirement, or when a data type tailored to the use case may perform better, you can define a custom type by implementing the org.apache.hadoop.io.WritableComparable interface and use it as the key type of a MapReduce computation.

2. Defining a custom Hadoop key type.
   1. Hadoop MapReduce keys are compared with one another, and that comparison is what drives sorting.
   2. A custom key type implements the WritableComparable<T> interface, which combines Writable with Comparable<T> and thereby adds the compareTo() method.
      compareTo() returns a negative integer, zero, or a positive integer when this object is less than, equal to, or greater than the object being compared. A minimal skeleton of the pattern is shown below.
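
As a minimal illustration of that contract (a sketch, not from the original post; LongKey and its value field are hypothetical names):

package com.kevin.model;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Hypothetical minimal key type showing the WritableComparable pattern.
public class LongKey implements WritableComparable<LongKey> {

	private long value;

	public void set(long value) { this.value = value; }

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(value);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		value = in.readLong();
	}

	@Override
	public int compareTo(LongKey o) {
		// Long.compare returns a negative, zero, or positive int,
		// which satisfies the contract described above.
		return Long.compare(this.value, o.value);
	}
}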

3. Example: first compute each phone number's upstream, downstream, and total traffic, then sort the results by total traffic.

Log file *.dat (the second field is the phone number; the third-to-last and second-to-last fields are the upstream and downstream traffic):

1363157985066 	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157995052 	13826544101	5C-0E-8B-C7-F1-E0:CMCC	120.197.40.4			4	0	264	0	200
1363157991076 	13926435656	20-10-7A-28-CC-0A:CMCC	120.196.100.99			2	4	132	1512	200
1363154400022 	13926251106	5C-0E-8B-8B-B1-50:CMCC	120.197.40.4			4	0	240	0	200
1363157993044 	18211575961	94-71-AC-CD-E6-18:CMCC-EASY	120.196.100.99	iface.qiyi.com	視訊網站	15	12	1527	2106	200
1363157995074 	84138413	5C-0E-8B-8C-E8-20:7DaysInn	120.197.40.4	122.72.52.12		20	16	4116	1432	200
1363157993055 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200
1363157995033 	15920133257	5C-0E-8B-C7-BA-20:CMCC	120.197.40.4	sug.so.360.cn	資訊保安	20	20	3156	2936	200
1363157983019 	13719199419	68-A1-B7-03-07-B1:CMCC-EASY	120.196.100.82			4	0	240	0	200
1363157984041 	13660577991	5C-0E-8B-92-5C-20:CMCC-EASY	120.197.40.4	s19.cnzz.com	站點統計	24	9	6960	690	200
1363157973098 	15013685858	5C-0E-8B-C7-F7-90:CMCC	120.197.40.4	rank.ie.sogou.com	搜尋引擎	28	27	3659	3538	200
1363157986029 	15989002119	E8-99-C4-4E-93-E0:CMCC-EASY	120.196.100.99	www.umeng.com	站點統計	3	3	1938	180	200
1363157992093 	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			15	9	918	4938	200
1363157986041 	13480253104	5C-0E-8B-C7-FC-80:CMCC-EASY	120.197.40.4			3	3	180	180	200
1363157984040 	13602846565	5C-0E-8B-8B-B6-00:CMCC	120.197.40.4	2052.flash2-http.qq.com	綜合門戶	15	12	1938	2910	200
1363157995093 	13922314466	00-FD-07-A2-EC-BA:CMCC	120.196.100.82	img.qfc.cn		12	12	3008	3720	200
1363157982040 	13502468823	5C-0A-5B-6A-0B-D4:CMCC-EASY	120.196.100.99	y0.ifengimg.com	綜合門戶	57	102	7335	110349	200
1363157986072 	18320173382	84-25-DB-4F-10-1A:CMCC-EASY	120.196.100.99	input.shouji.sogou.com	搜尋引擎	21	18	9531	2412	200
1363157990043 	13925057413	00-1F-64-E1-E6-9A:CMCC	120.196.100.55	t3.baidu.com	搜尋引擎	69	63	11058	48243	200
1363157988072 	13760778710	00-FD-07-A4-7B-08:CMCC	120.196.100.82			2	2	120	120	200
1363157985066 	13726238888	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
1363157993055 	13560436666	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99			18	15	1116	954	200

FlowBean.java, the custom type:
package com.kevin.model;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {
	
	private String phoneNbr;
	private long up_flow;
	private long d_flow;
	private long sum_flow;
	
	
	public void set(String phoneNbr, long up_flow, long d_flow){
		this.phoneNbr = phoneNbr;
		this.up_flow = up_flow;
		this.d_flow = d_flow;
		this.sum_flow = up_flow + d_flow;
	}
	
	/**
	 * Serialization: write the data fields out as a byte stream
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.phoneNbr);
		out.writeLong(this.up_flow);
		out.writeLong(this.d_flow);
		out.writeLong(this.sum_flow);
	}
	
	/**
	 * Deserialization: read the fields back from the byte stream,
	 * in exactly the order they were written
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.phoneNbr = in.readUTF();
		this.up_flow = in.readLong();
		this.d_flow = in.readLong();
		this.sum_flow = in.readLong();
	}

	@Override
	public int compareTo(FlowBean o) {
		// Sort by total traffic in descending order. Ties are broken on the
		// phone number so that distinct records never compare as equal, which
		// keeps this method consistent with the compareTo() contract.
		int cmp = Long.compare(o.sum_flow, this.sum_flow);
		return cmp != 0 ? cmp : this.phoneNbr.compareTo(o.phoneNbr);
	}
	
	@Override
	public String toString() {		 
		return up_flow + "\t" + d_flow + "\t" + sum_flow;
	}

	public String getPhoneNbr() {
		return phoneNbr;
	}

	public void setPhoneNbr(String phoneNbr) {
		this.phoneNbr = phoneNbr;
	}

	public long getUp_flow() {
		return up_flow;
	}

	public void setUp_flow(long up_flow) {
		this.up_flow = up_flow;
	}

	public long getD_flow() {
		return d_flow;
	}

	public void setD_flow(long d_flow) {
		this.d_flow = d_flow;
	}

	public long getSum_flow() {
		return sum_flow;
	}

	public void setSum_flow(long sum_flow) {
		this.sum_flow = sum_flow;
	}	

}
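
On the performance point from section 1: when keys are compared very often during the shuffle sort, Hadoop can compare them directly on their serialized bytes if a RawComparator is registered, skipping deserialization entirely. The following is a sketch, not part of the original post; it assumes the write() layout above, i.e. a writeUTF phone number (a 2-byte big-endian length prefix followed by the string bytes) and then three longs, of which sum_flow is the last.

package com.kevin.model;

import org.apache.hadoop.io.WritableComparator;

// Sketch: order serialized FlowBeans by sum_flow (descending) without
// deserializing them. Offsets depend on FlowBean.write() as shown above.
public class FlowBeanRawComparator extends WritableComparator {

	public FlowBeanRawComparator() {
		super(FlowBean.class);
	}

	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		// writeUTF stores a 2-byte big-endian length, then the string bytes.
		int utfLen1 = ((b1[s1] & 0xff) << 8) | (b1[s1 + 1] & 0xff);
		int utfLen2 = ((b2[s2] & 0xff) << 8) | (b2[s2 + 1] & 0xff);
		// sum_flow is the third long after the phone number string.
		long sum1 = readLong(b1, s1 + 2 + utfLen1 + 16);
		long sum2 = readLong(b2, s2 + 2 + utfLen2 + 16);
		if (sum1 != sum2) {
			return sum1 > sum2 ? -1 : 1; // descending by total traffic
		}
		// Tie-break on the raw phone-number bytes, mirroring compareTo().
		return compareBytes(b1, s1 + 2, utfLen1, b2, s2 + 2, utfLen2);
	}
}

A job can opt in with job.setSortComparatorClass(FlowBeanRawComparator.class), or the comparator can be registered globally via WritableComparator.define(FlowBean.class, new FlowBeanRawComparator()).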

AreaPartitioner.java, the custom partitioner: it extends Partitioner and routes records to reduce tasks by phone-number prefix.

package com.kevin.partitioner;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE>{
	
	private static HashMap<String, Integer> areaMap =  new HashMap<>();
	
	static {
		areaMap.put("136", 0);
		areaMap.put("137", 1);
		areaMap.put("138", 2);
		areaMap.put("139", 3);
	}
	
	@Override
	public int getPartition(KEY key, VALUE value, int numPartitions) {
		// Look up the partition by the first three digits of the phone number;
		// any prefix not in the map falls through to partition 4.
		Integer provinceCode = areaMap.get(key.toString().substring(0, 3));
		return provinceCode == null ? 4 : provinceCode;
	}

}
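
A quick way to sanity-check the mapping (a hypothetical snippet, not in the original post):

package com.kevin.partitioner;

import org.apache.hadoop.io.Text;

// Hypothetical check: print the partition each sample number maps to.
public class AreaPartitionerCheck {

	public static void main(String[] args) {
		AreaPartitioner<Text, Object> partitioner = new AreaPartitioner<>();
		String[] samples = {"13602846565", "13726230503", "13826544101",
				"13926435656", "15013685858"};
		for (String nbr : samples) {
			// Prints partitions 0, 1, 2, 3 for the 136/137/138/139
			// prefixes and 4 for any other prefix.
			System.out.println(nbr + " -> partition "
					+ partitioner.getPartition(new Text(nbr), null, 5));
		}
	}
}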


FlowCount.java, which aggregates traffic per phone number:

package com.kevin.mapreducedemo2;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.kevin.model.FlowBean;
import com.kevin.partitioner.AreaPartitioner;

// Hadoop's own serialization mechanism differs from the JDK's: it is more compact

public class FlowCount {
	
	public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{		
		private FlowBean flowBean = new FlowBean();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Grab one line of input
			String line = value.toString();
			// Split it into fields
			String[] fields = StringUtils.split(line, "\t");
			// Pick out the fields we need
			String phoneNbr = fields[1];
			long up_flow = Long.parseLong(fields[fields.length - 3]);
			long d_flow = Long.parseLong(fields[fields.length - 2]);
			// Pack the data into a FlowBean and emit it keyed by phone number
			flowBean.set(phoneNbr, up_flow, d_flow);
			context.write(new Text(phoneNbr), flowBean);
		}
	}
	
	public static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
		private FlowBean flowBean = new FlowBean();		
		@Override
		protected void reduce(Text key, Iterable<FlowBean> values,Context context) throws IOException, InterruptedException {
			long up_flow_sum = 0;
			long d_flow_sum = 0;
			for(FlowBean flowBean : values){
				up_flow_sum += flowBean.getUp_flow();
				d_flow_sum += flowBean.getD_flow();
			}
			flowBean.set(key.toString(), up_flow_sum, d_flow_sum);
			context.write(key, flowBean);
		}
	}
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf,"flowjob");		
		job.setJarByClass(FlowCount.class);
		
		job.setMapperClass(FlowCountMapper.class);
		job.setReducerClass(FlowCountReducer.class);
		
		/**
		 * Plug in the custom partitioner: AreaPartitioner
		 */
		job.setPartitionerClass(AreaPartitioner.class);
		
		/**
		 * Set the number of reduce tasks to match the number of partitions
		 * AreaPartitioner can return.
		 * With more reduce tasks than partitions, the extra tasks just produce empty output files.
		 * With fewer reduce tasks than partitions, the job fails, because some keys have no reduce task to receive them.
		 * (A single reduce task also works: every key is then sent to that one task.)
		 * A reduce task or map task is an instance of the Reducer or Mapper running on the cluster.
		 */
		job.setNumReduceTasks(5);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		
		FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-files/"));
		FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out/"));
		
		job.waitForCompletion(true);
	}
}
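
This job's output is what FlowCountSort consumes: TextOutputFormat writes the key (the phone number), a tab, then FlowBean.toString(), i.e. up_flow, d_flow, and sum_flow separated by tabs. For example, 13560439658 appears twice in the sample log (1116 + 918 upstream, 954 + 4938 downstream), so its line in data-out would read:

13560439658	2034	5892	7926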

FlowCountSort.java, which sorts the results by total traffic:

package com.kevin.mapreducedemo2;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.kevin.model.FlowBean;

public class FlowCountSort {
	
	public static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable>{
		private FlowBean bean = new FlowBean();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();			
			String[] fields = StringUtils.split(line, "\t");			
			String phoneNbr = fields[0];
			long up_flow = Long.parseLong(fields[1]);
			long d_flow = Long.parseLong(fields[2]);			
			bean.set(phoneNbr, up_flow, d_flow);
			
			context.write(bean, NullWritable.get());
		}
	}
	
	public static class FlowCountSortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean>{
		@Override
		protected void reduce(FlowBean bean, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
			// Keys arrive already sorted by FlowBean.compareTo(); simply
			// re-emit each one as phone number -> flow record.
			context.write(new Text(bean.getPhoneNbr()), bean);
		}
	}
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf,"flowjob");		
		job.setJarByClass(FlowCountSort.class);
		
		job.setMapperClass(FlowCountSortMapper.class);
		job.setReducerClass(FlowCountSortReducer.class);
		
		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(NullWritable.class);
		
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		
		FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out/"));
		FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out2/"));
		
		job.waitForCompletion(true);
	}
	
}
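
The two jobs run back to back: FlowCountSort reads the data-out directory that FlowCount writes. Since waitForCompletion(true) blocks until a job finishes, a simple driver (a sketch, not part of the original post) can chain them:

package com.kevin.mapreducedemo2;

// Hypothetical driver chaining the two jobs sequentially; the sort job
// only starts after the count job has finished writing data-out.
public class FlowDriver {

	public static void main(String[] args) throws Exception {
		FlowCount.main(args);     // writes .../flow_count_demo/data-out/
		FlowCountSort.main(args); // reads data-out, writes .../data-out2/
	}
}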