
A MapReduce Join Algorithm Programming Example


Map-side join algorithm

1. Principle

This approach suits joins where one of the tables is small. The small table is shipped to every map node, so each map task can join the large-table records it reads against the small table locally and emit the final result directly. This greatly increases the parallelism of the join and speeds up processing.

2. Example:

Data for the two tables:

Product table data:
p0001,小米5,1000,2000
p0002,錘子T1,1000,3000

Order table data:
1001,20150710,p0001,2
1002,20150710,p0002,3
1002,20150710,p0003,3
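For reference, running the map-side join below over these files should produce output along these lines (the key is the order id, followed by the date, the cached product info, and the amount). Note that p0003 has no match in the product table, so its product info comes through as the literal string null unless the lookup miss is handled:

1001	20150710	小米5	1000	2000	2
1002	20150710	錘子T1	1000	3000	3
1002	20150710	null	3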

Writing the Mapper class

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class joinMap extends Mapper<LongWritable, Text, Text, Text> {
    HashMap<String, String> map = new HashMap<String, String>();
    String line = null;

    /**
     * In the map-side setup method, fetch the cache file and load it
     * into the in-memory HashMap in one pass.
     *
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get all the cache files
        URI[] cacheFiles = DistributedCache.getCacheFiles(context.getConfiguration());
        // Get the file system and open the cache file
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
        while ((line = bufferedReader.readLine()) != null) {
            String[] split = line.split(",");
            map.put(split[0], split[1] + "\t" + split[2] + "\t" + split[3]);
        }
        IOUtils.closeStream(bufferedReader);
        fileSystem.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // This reads the one input split (on HDFS) that this map task is responsible for
        String[] fields = value.toString().split(",");
        String orderId = fields[0];
        String date = fields[1];
        String pdId = fields[2];
        String amount = fields[3];
        // Look up the product details in the in-memory map
        String productInfo = map.get(pdId);
        context.write(new Text(orderId), new Text(date + "\t" + productInfo + "\t" + amount));
    }
}
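A side note: the DistributedCache class used above has been deprecated since Hadoop 2.x. Below is a minimal sketch of the same mapper on the newer cache-file API, where context.getCacheFiles() replaces DistributedCache.getCacheFiles(conf). The class name joinMapModern is illustrative, not from the original:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

// Sketch of the same map-side join on the non-deprecated Hadoop 2.x+ cache API
public class joinMapModern extends Mapper<LongWritable, Text, Text, Text> {
    private final HashMap<String, String> map = new HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // context.getCacheFiles() replaces DistributedCache.getCacheFiles(conf);
        // the file itself is registered in the driver with job.addCacheFile(uri)
        URI[] cacheFiles = context.getCacheFiles();
        FileSystem fs = FileSystem.get(cacheFiles[0], context.getConfiguration());
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(cacheFiles[0]))));
        String cached;
        while ((cached = reader.readLine()) != null) {
            String[] split = cached.split(",");
            map.put(split[0], split[1] + "\t" + split[2] + "\t" + split[3]);
        }
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        // Same join as above: enrich the order record with the cached product info
        context.write(new Text(fields[0]),
                new Text(fields[1] + "\t" + map.get(fields[2]) + "\t" + fields[3]));
    }
}

On the driver side, the matching call would be job.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt")), placed after Job.getInstance(...).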

Writing the main class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class joinJobMain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = super.getConf();
        // Note: the cache file added here must be placed on HDFS; a local file cannot be loaded this way
        DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), conf);
        Job job = Job.getInstance(conf, joinJobMain.class.getSimpleName());
        job.setJarByClass(joinJobMain.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///d:\\map端join\\map_join_input"));
        job.setMapperClass(joinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///d:\\map端join\\map_join_output"));
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;

    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        ToolRunner.run(configuration, new joinJobMain(), args);
    }
}
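Because the input and output paths point at the local filesystem (file:///d:\...), this driver is evidently meant to run in local mode, e.g. straight from an IDE, with only the cache file read from HDFS. If packaged into a jar instead, a hypothetical submission (jar name assumed) would look like:

hadoop jar map-join-example.jar joinJobMain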

Drawbacks of the reduce-side join algorithm:

Drawback: in this approach the join is performed in the reduce phase, so the reducers bear heavy processing pressure while the map nodes carry very little computational load. Resource utilization is poor, and the reduce phase is highly prone to data skew.
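For contrast, here is a minimal sketch of what a reduce-side join looks like (class names and the P#/O# tags are illustrative, not from the original). The mapper keys every record by product id and tags it with its source table; the reducer then crosses the two groups per key, which is exactly the work that concentrates load on the reducers and invites skew on hot keys:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

// Illustrative reduce-side join: key every record by product id, tag it
// with the table it came from, then pair the groups up in the reducer
public class ReduceJoinSketch {

    public static class TagMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // Decide which table this record came from by its input file name
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            if (fileName.startsWith("pdts")) {
                // Product record: p0001,小米5,1000,2000 -> key = product id
                context.write(new Text(fields[0]), new Text("P#" + value.toString()));
            } else {
                // Order record: 1001,20150710,p0001,2 -> key = product id (3rd field)
                context.write(new Text(fields[2]), new Text("O#" + value.toString()));
            }
        }
    }

    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> products = new ArrayList<String>();
            List<String> orders = new ArrayList<String>();
            for (Text v : values) {
                String s = v.toString();
                if (s.startsWith("P#")) products.add(s.substring(2));
                else orders.add(s.substring(2));
            }
            // This per-key cross product is the work that piles up on reducers;
            // one hot product id means one reducer does most of the join
            for (String order : orders) {
                for (String product : products) {
                    context.write(key, new Text(order + "\t" + product));
                }
            }
        }
    }
}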