1. 程式人生 > >MapReduce實現倒排索引

MapReduce實現倒排索引

倒排索引這個名字讓人很容易誤解成A-Z,倒排成Z-A;但實際上卻不是這樣的。
一般我們是根據檔案來確定檔案內容,而倒排索引是指通過檔案內容來得到文件的資訊,也就是根據一些單詞判斷它在哪個檔案中。
知道了這一點下面就好做了:

準備一些元資料

在這裡插入圖片描述
下面我們要進行兩次MapReduce處理

第一次資料處理

package com.invalid;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public static void main(String[] args) throws Exception {
                 //建立連線
		Configuration conf = new Configuration();
		Job job =Job.getInstance(conf);
		//設定要執行的jar
		job.setJarByClass(InvalidDriver.class);
		//設定map和reduce
		job.setMapperClass(InvalidMapper.class);
		job.setReducerClass(InvalidReducer.class);
		//設定map輸出
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		//設定reduce輸出
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);	
		//設定輸入輸出路徑
		FileInputFormat.setInputPaths(job,new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));	
	    //程式執行結果
		boolean result = job.waitForCompletion(true);
		System.out.println(result);		
	}
}
/**
 * Mapper for the first pass: for every word of every input line it emits the
 * pair ("word--fileName", 1), where fileName is the name of the file that
 * backs the current input split.
 */
class InvalidMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
	// Name of the file behind the current input split; assigned in setup().
	String name;
	// Output key, reused for every record instead of allocating a new Text per word.
	Text k = new Text();
	// Output value; every occurrence counts as 1.
	IntWritable v = new IntWritable(1);

	/**
	 * Reads the file name from the input split. Runs only once, before the map() calls.
	 * The cast assumes a file-based input format that hands out FileSplit instances.
	 */
	@Override
	protected void setup(Context context)
			throws IOException, InterruptedException {
		FileSplit split = (FileSplit) context.getInputSplit();
		name = split.getPath().getName();
	}

	/**
	 * Splits the line on whitespace and emits ("word--fileName", 1) per word.
	 *
	 * @param key     input key supplied by the framework (unused)
	 * @param value   one line of input text
	 * @param context sink for the emitted pairs
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		for (String word : value.toString().split("\\s+")) {
			// Blank lines and leading whitespace yield an empty token; emitting it
			// would create a bogus "--fileName" key.
			if (word.isEmpty()) {
				continue;
			}
			k.set(word + "--" + name);
			context.write(k, v);
		}
	}
}
/**
 * Reducer for the first pass: adds up the counts received for one
 * "word--fileName" key and writes the key together with the total.
 */
class InvalidReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
	/**
	 * Writes (key, sum of values).
	 *
	 * @param key     the key shared by all of {@code values}
	 * @param values  the counts emitted for {@code key}
	 * @param context sink for the aggregated pair
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values,Context context)
			throws IOException, InterruptedException {
		IntWritable total = new IntWritable(sum(values));
		Text outKey = new Text(key);
		context.write(outKey, total);
	}

	// Returns the arithmetic sum of all counts in the given sequence.
	private static int sum(Iterable<IntWritable> counts) {
		int total = 0;
		for (IntWritable each : counts) {
			total = total + each.get();
		}
		return total;
	}
}

第一次處理結果如下

在這裡插入圖片描述

第二次處理

package com.invalid;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the second MapReduce pass of the inverted index.
 *
 * <p>Wires {@link InvalidMapper2} and {@link InvalidReducer2} into a job whose
 * map and reduce outputs are both (Text, Text) pairs.
 */
public class InvalidDriver2 {
	/**
	 * Configures and runs the second-pass job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		// Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
		if (args.length < 2) {
			System.err.println("Usage: InvalidDriver2 <input path> <output path>");
			System.exit(2);
		}
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(InvalidDriver2.class);
		job.setMapperClass(InvalidMapper2.class);
		job.setReducerClass(InvalidReducer2.class);
		// Both the intermediate and the final records are text pairs.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean result = job.waitForCompletion(true);
		System.out.println(result);
		// Propagate failure to the caller through the process exit status.
		System.exit(result ? 0 : 1);
	}
}
/**
 * Mapper for the second pass: cuts each input line at the "--" separator and
 * emits (text before the separator, text after the separator).
 */
class InvalidMapper2 extends Mapper<LongWritable,Text,Text,Text>{
	// Separator placed between the word and the rest of the record.
	private static final String SEPARATOR = "--";
	// Output objects reused across map() calls.
	private final Text outKey = new Text();
	private final Text outValue = new Text();

	/**
	 * Emits one (key, value) pair per well-formed line; lines without the
	 * separator are skipped instead of failing the task.
	 *
	 * @param key     input key supplied by the framework (unused)
	 * @param value   one input line, expected to contain "--"
	 * @param context sink for the emitted pair
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		// Cut at the LAST separator so that a word which itself contains "--"
		// stays intact. NOTE(review): a trailing part that contains "--" (for
		// example a file name) cannot be told apart from this case.
		int sep = line.lastIndexOf(SEPARATOR);
		if (sep < 0) {
			// Malformed or empty line: nothing to index.
			return;
		}
		outKey.set(line.substring(0, sep));
		outValue.set(line.substring(sep + SEPARATOR.length()));
		context.write(outKey, outValue);
	}
}
/**
 * Reducer for the second pass: concatenates all values received for a key
 * into one line, rewriting each tab inside a value as "-->" and following
 * every value with a single space.
 */
class InvalidReducer2 extends Reducer<Text, Text, Text, Text>{
	/**
	 * Writes (key, joined values).
	 *
	 * @param key     the key shared by all of {@code values}
	 * @param values  the values emitted for {@code key}
	 * @param context sink for the joined record
	 */
	@Override
	protected void reduce(Text key, Iterable<Text> values,Context context)
			throws IOException, InterruptedException {
		StringBuilder joined = new StringBuilder();
		for (Text entry : values) {
			String formatted = entry.toString().replace("\t", "-->");
			joined.append(formatted);
			joined.append(" ");
		}
		Text outKey = new Text(key);
		Text outValue = new Text(joined.toString());
		context.write(outKey, outValue);
	}
}

第二次處理結果

在這裡插入圖片描述

至此,倒排索引完成