Hadoop和Spark分別實現二次排序

阿新 • • 發佈：2019-01-19

將下列資料中每個分割槽中的第一列順序排列，第二列倒序排列。

Text

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

20	21
50	51
50	52
50	53
50	54
60	51
60	53
60	52
60	56
60	57
70	58
60	61
70	54
70	55
70	56
70	57
70	58
10	55
80	67
90	43
30	44
50	67
50	87
40	77
20	11
10	55
20	84
70	45
90	55
91	44
78	44
76	32
88	23
91	34
56	11
33	23
24	11

使用Hadoop

寫法一，參考《Hadoop權威指南》改寫：

IntPair類

Java Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

package com.hadoop.mr.sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;

publicclass IntPair implements WritableComparable<IntPair> {
    private IntWritable first;
    private IntWritable second;

    public

void set(IntWritable first, IntWritable second) {
        this.first = first;
        this.second = second;
    }
    //注意：需要新增無參的構造方法，否則反射時會報錯。
public IntPair() {
        set(new IntWritable(), new IntWritable());
    }
    public IntPair(int first, int second) {
        set(new IntWritable(first), new IntWritable(second));
    }

    public IntPair(IntWritable first, IntWritable second) {
        set(first, second);
    }

    public IntWritable getFirst() {
        return first;
    }

    publicvoid setFirst(IntWritable first) {
        this.first = first;
    }

    public IntWritable getSecond() {
        return second;
    }

    publicvoid setSecond(IntWritable second) {
        this.second = second;
    }

    @Override
    publicvoid write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
    }

    @Override
    publicvoid readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
    }

    @Override
    publicint hashCode() {
        return first.hashCode() * 163 + second.hashCode();
    }

    @Override
    publicboolean equals(Object o) {
        if (o instanceof IntPair) {
            IntPair tp = (IntPair) o;
            return first.equals(tp.first) && second.equals(tp.second);
        }
        return false;
    }

    @Override
    publicString toString() {
        return first + "\t" + second;
    }

    @Override
    publicint compareTo(IntPair tp) {
        int cmp = first.compareTo(tp.first);
        if (cmp != 0) {
            return cmp;
        }
        return second.compareTo(tp.second);
    }
}

Secondary類

Java Code

package com.hadoop.mr.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

publicclass SecondarySort {
    staticclass TheMapper extends Mapper<LongWritable, Text, IntPair, NullWritable> {
        @Override
        protectedvoid map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            int field1 = Integer.parseInt(fields[0]);
            int field2 = Integer.parseInt(fields[1]);
            context.write(new IntPair(field1,field2), NullWritable.get());
        }
    }

    staticclass TheReducer extends Reducer<IntPair, NullWritable,IntPair, NullWritable> {
        //private static final Text SEPARATOR = new Text("------------------------------------------------");
        @Override
        protectedvoid reduce(IntPair key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    publicstaticclass FirstPartitioner extends Partitioner<IntPair, NullWritable> {

        @Override
        publicint getPartition(IntPair key, NullWritable value,
                int numPartitions) {
            return Math.abs(key.getFirst().get()) % numPartitions;
        }

    }

    //如果不新增這個類，預設第一列和第二列都是升序排序的。這個類的作用是使第一列升序排序，第二列降序排序
publicstaticclass KeyComparator extends WritableComparator {
        //無參構造器必須加上，否則報錯。
protected KeyComparator() {
            super(IntPair.class, true);
        }
        @Override
        publicint compare(WritableComparable a, WritableComparable b) {
            IntPair ip1 = (IntPair) a;
            IntPair ip2 = (IntPair) b;
            //第一列按升序排序
int cmp = ip1.getFirst().compareTo(ip2.getFirst());
            if (cmp != 0) {
                return cmp;
            }
            //在第一列相等的情況下，第二列按倒序排序
return -ip1.getSecond().compareTo(ip2.getSecond());
        }
    }

/*  public static class GroupComparator extends WritableComparator {
        //無參構造器必須加上，否則報錯。
        protected GroupComparator() {
            super(IntPair.class, true);
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            IntPair ip1 = (IntPair) a;
            IntPair ip2 = (IntPair) b;
            return ip1.getFirst().compareTo(ip2.getFirst());
        }
    }*///入口程式
publicstaticvoid main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SecondarySort.class);
        //設定Mapper的相關屬性
        job.setMapperClass(TheMapper.class);
        //當Mapper中的輸出的key和value的型別和Reduce輸出的key和value的型別相同時，以下兩句可以省略。
//job.setMapOutputKeyClass(IntPair.class);
//job.setMapOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        //設定分割槽的相關屬性
        job.setPartitionerClass(FirstPartitioner.class);
        //在map中對key進行排序
        job.setSortComparatorClass(KeyComparator.class);
        //job.setGroupingComparatorClass(GroupComparator.class);
//設定Reducer的相關屬性
        job.setReducerClass(TheReducer.class);
        job.setOutputKeyClass(IntPair.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //設定Reducer數量
int reduceNum = 1;
        if(args.length >= 3 && args[2] != null){
            reduceNum = Integer.parseInt(args[2]);
        }
        job.setNumReduceTasks(reduceNum);
        job.waitForCompletion(true);
    }

}

打成secsort.jar包，從hdfs上的/test/secsortdata獲取資料檔案，mapreduce輸出目錄是/test/secsortresult8，啟動1個reduce：

hadoop jar secsort.jar /test/secsortdata /test/secsortresult8 1

測試結果：

可以發現第一列(key)是順序排列的，對於相同key的values，是倒序排列的。

如果使用兩個reduce會怎樣？

hadoop jar secsort.jar /test/secsortdata /test/secsortresult9 2

測試結果：

那如果將程式碼中GroupComparator類的註釋，以及main方法中 job.setGroupingComparatorClass(GroupComparator.class); 這一行的註釋去掉，結果會怎麼樣呢？

如上圖，它只會輸出每個key中的第一個value值。

寫法二，參考自http://www.superwu.cn/2013/08/18/492/：

程式碼：

IntPair類可以改寫成：

Java Code

package com.hadoop.mr.sort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

publicclass IntPair implements WritableComparable<IntPair> {
    privateint first = 0;
    privateint second = 0;

    publicvoid set(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // 注意：需要新增無參的構造方法，否則反射時會報錯。
public IntPair() {

    }

    public IntPair(int first, int second) {
        set(first, second);
    }

    publicint getFirst() {
        return first;
    }

    publicvoid setFirst(int first) {
        this.first = first;
    }

    publicint getSecond() {
        return second;
    }

    publicvoid setSecond(int second) {
        this.second = second;
    }

    @Override
    publicvoid write(DataOutput out) throws IOException {
        out.write(first);
        out.write(second);
    }

    @Override
    publicvoid readFields(DataInput in) throws IOException {
        first = in.readInt();
        second = in.readInt();
    }

    @Override
    publicint hashCode() {
        return first + "".hashCode() + second + "".hashCode();
    }

    @Override
    publicboolean equals(Object right) {
        if (right instanceof IntPair) {
            IntPair r = (IntPair) right;
            return r.getFirst() == first && r.getSecond() == second;
        } else {
            return false;
        }
    }

    // 這裡的程式碼是關鍵，因為對key排序時，呼叫的就是這個compareTo方法
    @Override
    publicint compareTo(IntPair o) {
        if (first != o.getFirst()) {
            return first - o.getFirst();
        } elseif (second != o.getSecond()) {
            return o.getSecond() - second;
        } else {
            return0;
        }
    }
}

Secondary類可以改寫成：

Java Code

package com.hadoop.mr.sort;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

publicclass SecondarySort {
    staticclass TheMapper extends
            Mapper<LongWritable, Text, IntPair, IntWritable> {
        privatefinal IntPair outKey = new IntPair();
        privatefinal IntWritable outValue = new IntWritable();

        @Override
        protectedvoid map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // 預設以” \t\n\r\f”（前有一個空格，引號不是）為分割符
            StringTokenizer itr = new StringTokenizer(value.toString(), "\t");
            int first = 0;
            int second = 0;
            if (itr.hasMoreTokens()) {
                first = Integer.parseInt(itr.nextToken());
                if (itr.hasMoreTokens()) {
                    second = Integer.parseInt(itr.nextToken());
                }
                outKey.set(first, second);
                outValue.set(second);
                context.write(outKey, outValue);
            }
        }
    }

    staticclass TheReducer extends
            Reducer<IntPair, IntWritable, Text, IntWritable> {
        privatestaticfinal Text SEPARATOR = new Text("------------------------------------------------");
        privatefinal Text first = new Text();
        @Override
        protectedvoid reduce(IntPair inKey, Iterable<IntWritable> inValues, Context context)
                throws IOException, InterruptedException {
            first.set(Integer.toString(inKey.getFirst()));
            for(IntWritable value: inValues) {
              context.write(first, value);
            }
            context.write(SEPARATOR, null);
        }

    }

    publicstaticclass FirstPartitioner extends
            Partitioner<IntPair, IntWritable> {
        @Override
        publicint getPartition(IntPair key, IntWritable value,int numPartitions) {
            return Math.abs(key.getFirst()* 127) % numPartitions;
        }
    }

    /**
     * 在分組比較的時候，只比較原來的key，而不是組合key。
     */publicstaticclass GroupComparator implements RawComparator<IntPair> {
      @Override
      publicint compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8, b2, s2, Integer.SIZE/8);
      }
      @Override
      publicint compare(IntPair o1, IntPair o2) {
        int first1 = o1.getFirst();
        int first2 = o2.getFirst();
        return first1 - first2;
      }
    }

    // 入口程式
publicstaticvoid main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SecondarySort.class);
        // 設定Mapper的相關屬性
        job.setMapperClass(TheMapper.class);
        // 當Mapper中的輸出的key和value的型別和Reduce輸出的key和value的型別相同時，以下兩句可以省略。
        job.setMapOutputKeyClass(IntPair.class);
        job.setMapOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // 設定分割槽的相關屬性
        job.setPartitionerClass(FirstPartitioner.class);
        job.setGroupingComparatorClass(GroupComparator.class);
        // 設定Reducer的相關屬性
        job.setReducerClass(TheReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 設定Reducer數量
int reduceNum = 1;
        if (args.length >= 3 && args[2] != null) {
            reduceNum = Integer.parseInt(args[2]);
        }
        job.setNumReduceTasks(reduceNum);
        job.waitForCompletion(true);
    }
}

測試結果：

hdfs dfs -cat /test/secsortresult18/part-r-*

Text

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

10      55
10      55
------------------------------------------------
20      84
20      21
20      11
------------------------------------------------
24      11
------------------------------------------------
30      44
------------------------------------------------
40      77
------------------------------------------------
50      87
50      67
50      54
50      53
50      52
50      51
------------------------------------------------
56      11
------------------------------------------------
60      61
60      57
60      56
60      53
60      52
60      51
------------------------------------------------
70      58
70      58
70      57
70      56
70      55
70      54
70      45
------------------------------------------------
76      32
------------------------------------------------
78      44
------------------------------------------------
80      67
------------------------------------------------
88      23
------------------------------------------------
90      55
90      43
------------------------------------------------
33      23
------------------------------------------------
91      44
91      34
------------------------------------------------

使用Spark來實現二次排序：

Scala Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

package com.spark.secondApp
import org.apache.spark.{SparkContext, SparkConf}

/** Spark implementation of the secondary sort: first column ascending, second column descending. */
object SecondarySort {
  /**
   * Reads tab-separated integer pairs, sorts them and prints the result.
   *
   * Both columns are parsed to `Int` so the ordering is numeric; comparing the
   * raw strings would order them lexicographically (e.g. "9" after "10").
   * Lines with fewer than two fields (such as a trailing blank line) are
   * skipped; a non-numeric field still fails with `NumberFormatException`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(" Secondary Sort ").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val file = sc.textFile("hdfs://worker02:9000/test/secsortdata")
      val rdd = file
        .map(line => line.split("\t"))
        .filter(_.length >= 2)
        .map(fields => (fields(0).trim.toInt, fields(1).trim.toInt))
        .groupByKey()
        .sortByKey(ascending = true)
        .map { case (key, values) => (key, values.toList.sortWith(_ > _)) }
      // Flatten (key, sorted values) back into one (key, value) row per value.
      val rdd2 = rdd.flatMap { case (key, values) => values.map(value => (key, value)) }
      // RDDs are lazy: without an action nothing above would ever run.
      rdd2.collect().foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}

將上面程式碼中從 val file 開始到 rdd2 定義結束的部分複製到spark-shell中執行後，再使用rdd2.collect，結果如下：

上圖中第一列升序排列，第二列降序排列。

Hadoop實現二次排序需要近200行程式碼，而Spark只需要20多行程式碼。

原文連結：http://blog.csdn.net/u014729236/article/details/46327335

Hadoop和Spark分別實現二次排序

Hadoop和Spark分別實現二次排序

Spark:Java實現二次排序

分別使用Hadoop和Spark實現二次排序

Spark實現二次排序

大資料技術學習筆記之Hadoop框架基礎5-Hadoop高階特性HA及二次排序思想

結合案例講解MapReduce重要知識點 ------- 使用自定義MapReduce資料型別實現二次排序

43.top10熱門品類之使用Scala實現二次排序

Hadoop之MapReduce自定義二次排序流程例項詳解

hadoop 二次排序和一個java實現

Python Hadoop Mapreduce 實現Hadoop Streaming分組和二次排序

hadoop二次排序的原理和實現

一起學Hadoop——二次排序演算法的實現

Hadoop MapReduce二次排序演算法與實現之演算法解析

hadoop二次排序 (Map/Reduce中分割槽和分組的問題)

《資料演算法-Hadoop/Spark大資料處理技巧》讀書筆記（一）——二次排序

spark學習記錄（七、二次排序和分組取TopN問題）

Spark 二次排序自定義key 實現(Java)

Hadoop鏈式MapReduce、多維排序、倒排索引、自連線演算法、二次排序、Join效能優化、處理員工資訊Join實戰、URL流量分析、TopN及其排序、求平均值和最大最小值、資料清洗ETL、分析氣

MapReduce二次排序原理和實現

Hadoop 二次排序實現

Hadoop和Spark分別實現二次排序

相關推薦