Implementing an Inverted Index with MapReduce
The name "inverted index" is easy to misread as meaning the entries are simply reversed, say from A-Z into Z-A, but that is not what it means.
Normally we start from a file and look up its contents; an inverted index goes the other way, using the content to locate the documents. In other words, given a word, it tells you which files that word appears in.
Once you understand that, the rest is straightforward:
Prepare some source data files.
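For illustration, suppose the input is two small text files of space-separated words. The file names a.txt and b.txt and their contents below are only an assumed example, not the author's original data:

a.txt:
hello world hello hadoop

b.txt:
hello hadoop world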
We will process the data with two MapReduce jobs.
First pass over the data
package com.invalid;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvalidDriver {
    public static void main(String[] args) throws Exception {
        // Set up the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Jar to run
        job.setJarByClass(InvalidDriver.class);

        // Mapper and reducer classes
        job.setMapperClass(InvalidMapper.class);
        job.setReducerClass(InvalidReducer.class);

        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Run the job and report the result
        boolean result = job.waitForCompletion(true);
        System.out.println(result);
    }
}

class InvalidMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    String name;

    // setup() runs only once per split: read the file name here
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    // Emit <word--filename, 1> for every word in the line
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] split = value.toString().split(" ");
        for (String word : split) {
            context.write(new Text(word + "--" + name), new IntWritable(1));
        }
    }
}

class InvalidReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Sum the counts for each <word--filename> key
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(new Text(key), new IntWritable(count));
    }
}
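The driver reads the input and output paths from the command line, so the job can be submitted in the usual way. The jar name and paths below are placeholders, not taken from the original post:

hadoop jar invalid.jar com.invalid.InvalidDriver /input /output1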
The first pass produces one line per word-and-file pair: the word and the file name joined by "--", then a tab, then the count.
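With the example files assumed above, the first-pass output would look roughly like this (keys are sorted by MapReduce):

hadoop--a.txt	1
hadoop--b.txt	1
hello--a.txt	2
hello--b.txt	1
world--a.txt	1
world--b.txt	1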
Second pass
package com.invalid;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvalidDriver2 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvalidDriver2.class);

        job.setMapperClass(InvalidMapper2.class);
        job.setReducerClass(InvalidReducer2.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.out.println(result);
    }
}

class InvalidMapper2 extends Mapper<LongWritable, Text, Text, Text> {
    // Each input line looks like "word--filename<TAB>count"; split on "--"
    // so the key becomes the word and the value "filename<TAB>count"
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] split = value.toString().split("--");
        String keys = split[0];
        String values = split[1];
        context.write(new Text(keys), new Text(values));
    }
}

class InvalidReducer2 extends Reducer<Text, Text, Text, Text> {
    // Concatenate all "filename<TAB>count" values for a word, rewriting the
    // tab as "-->" so each entry reads "filename-->count"
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text string : values) {
            sb.append(string.toString().replaceAll("\t", "-->")).append(" ");
        }
        context.write(new Text(key), new Text(sb.toString()));
    }
}
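With the same assumed example data, the final inverted index would look roughly like this; the order of the file entries after each word is not guaranteed, since it depends on the order in which the reducer receives the values:

hadoop	a.txt-->1 b.txt-->1
hello	a.txt-->2 b.txt-->1
world	a.txt-->1 b.txt-->1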