使用spark對hive表中的多列數據判重
阿新 • • 發佈:2017-07-23
個數 stack duplicate house transient this dataframe except cti
本文處理的場景如下:對 hive 表中數據的多個列進行判重(deduplicate),找出各列取值的重復情況。
1、先解決依賴:在 pom.xml 中引入 spark 相關的所有包
spark-hive是我們進行hive表spark處理的關鍵。
<!-- Spark dependencies for reading Hive tables from a Spark job.
     spark-hive is the key artifact: it provides HiveContext, which is what
     lets Spark query Hive-managed tables. All Spark artifacts are `provided`
     because the cluster supplies them at runtime; only fastjson is bundled. -->
<dependencies>
    <!-- Core Spark runtime (RDDs, JavaSparkContext). -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>1.6.0</version>
        <scope>provided</scope>
    </dependency>
    <!-- Hive integration: HiveContext and Hive SerDe support. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.10</artifactId>
        <version>1.6.0</version>
        <scope>provided</scope>
    </dependency>
    <!-- DataFrame / SQL API. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>1.6.0</version>
        <scope>provided</scope>
    </dependency>
    <!-- JSON utility library, shipped inside the fat jar. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.19</version>
    </dependency>
</dependencies>
2、spark-client
package com.xiaoju.kangaroo.duplicate;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;

import java.io.Serializable;

/**
 * Thin wrapper that owns a {@link JavaSparkContext} configured for
 * yarn-client mode and hands out SQL/Hive contexts bound to it.
 *
 * <p>Serializable only so that enclosing job classes holding a (transient)
 * reference to it can themselves be serialized by Spark.
 */
public class SparkClient implements Serializable {

    private SparkConf sparkConf;
    private JavaSparkContext javaSparkContext;

    public SparkClient() {
        initSparkConf();
        javaSparkContext = new JavaSparkContext(sparkConf);
    }

    /** @return a fresh {@link SQLContext} bound to this client's SparkContext. */
    public SQLContext getSQLContext() {
        return new SQLContext(javaSparkContext);
    }

    /** @return a fresh {@link HiveContext} bound to this client's SparkContext (needed to query Hive tables). */
    public HiveContext getHiveContext() {
        return new HiveContext(javaSparkContext);
    }

    /**
     * Builds the SparkConf used by the constructor.
     *
     * <p>Fix: the original wrapped this in a {@code catch (Exception)} that
     * only printed the stack trace — nothing here throws a checked exception,
     * and swallowing a failure would leave {@code sparkConf} null, turning any
     * real error into an unrelated NPE inside {@code new JavaSparkContext}.
     * The pointless try/catch has been removed so failures surface directly.
     */
    private void initSparkConf() {
        // Warehouse dir only matters for locally-created databases; the
        // working directory is an arbitrary but harmless choice here.
        String warehouseLocation = System.getProperty("user.dir");
        sparkConf = new SparkConf()
                .setAppName("duplicate")
                .set("spark.sql.warehouse.dir", warehouseLocation)
                .setMaster("yarn-client");
    }
}
3、判重流程
package com.xiaoju.kangaroo.duplicate; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.hive.HiveContext; import scala.Tuple2; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class SparkDuplicate implements Serializable { private transient SparkClient sparkClient; private transient HiveContext hiveContext; private String db; private String tb; private String pt; private String cols; public SparkDuplicate(String db, String tb, String pt, String cols) { this.db = db; this.tb = tb; this.pt = pt; this.cols = cols; this.sparkClient = new SparkClient(); this.hiveContext = sparkClient.getHiveContext(); } public void duplicate() { String partition = formatPartition(pt); String query = String.format("select * from %s.%s where %s", db ,tb, partition); System.out.println(query); DataFrame rows = hiveContext.sql(query); JavaRDD<Row> rdd = rows.toJavaRDD(); Map<String, Integer> repeatRetMap = rdd.flatMap(new FlatMapFunction<Row, String>() { public Iterable<String> call(Row row) throws Exception { HashMap<String, Object> rowMap = formatRowMap(row); List<String> sList = new ArrayList<String>(); String[] colList = cols.split(","); for (String col : colList) { sList.add(col + "@" + rowMap.get(col)); } return sList; } }).mapToPair(new PairFunction<String, String, Integer>() { public Tuple2<String, Integer> call(String s) throws Exception { return new Tuple2<String, Integer>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { public Integer call(Integer integer, Integer integer2) throws Exception { return integer + integer2; } }).map(new Function<Tuple2<String,Integer>, 
Map<String, Integer>>() { public Map<String, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception { Map<String, Integer> retMap = new HashMap<String, Integer>(); if (stringIntegerTuple2._2 > 1) { retMap.put(stringIntegerTuple2._1, stringIntegerTuple2._2); } return retMap; } }).reduce(new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() { public Map<String, Integer> call(Map<String, Integer> stringIntegerMap, Map<String, Integer> stringIntegerMap2) throws Exception { stringIntegerMap.putAll(stringIntegerMap2); return stringIntegerMap; } }); for (Map.Entry<String, Integer> entry : repeatRetMap.entrySet()) { if (entry.getValue() > 1) { System.out.println("重復值為:" + entry.getKey() + ", 重復個數" + entry.getValue()); } } } private String formatPartition(String partition) { String format = ""; if (partition.startsWith("pt") || partition.startsWith("dt")) { String[] items = partition.split("="); for (int i = 0; i < items.length; i++) { if (items[i].equals("pt") || items[i].equals("dt")) { format += items[i]; } else { format += "=‘" + items[i] + "‘"; } } } else { String[] keys; if (partition.contains("w=")){ keys = new String[] {"year", "week"}; partition = partition.replace("w=", ""); } else{ keys = new String[] {"year","month","day", "hour"}; } String[] items = partition.split("/"); for(int i=0; i<items.length; i++) { if (i == items.length-1) { format += keys[i] + "=‘" + items[i] + "‘"; } else { format += keys[i] + "=‘" + items[i] + "‘ and "; } } } return format; } private HashMap<String, Object> formatRowMap(Row row){ HashMap<String, Object> rowMap = new HashMap<String, Object>(); try { for (int i=0; i<row.schema().fields().length; i++) { String colName = row.schema().fields()[i].name(); Object colValue = row.get(i); rowMap.put(colName, colValue); } }catch (Exception ex) { ex.printStackTrace(); } return rowMap; } public static void main(String[] args) { String db = args[0]; String tb = args[1]; String pt = args[2]; String 
cols = args[3]; SparkDuplicate sparkDuplicate = new SparkDuplicate(db, tb, pt, cols); sparkDuplicate.duplicate(); } }
4、運行方式
提交任務腳本
#!/bin/bash
# run.sh — submit the Hive-column duplicate-check Spark job.
# Usage: sh run.sh <db> <table> <partition> <cols>
source /etc/profile
source ~/.bash_profile

db=$1
table=$2
partition=$3
cols=$4

# yarn-client submission; resources sized for a large scan (50 x 13G executors).
spark-submit \
    --queue=root.zhiliangbu_prod_datamonitor \
    --driver-memory 500M \
    --executor-memory 13G \
    --num-executors 50 \
    spark-duplicate-1.0-SNAPSHOT-jar-with-dependencies.jar \
    ${db} ${table} ${partition} ${cols}
運行:
sh run.sh gulfstream_ods g_order 2017/07/11 area,type
結果
重復值為:area@179, 重復個數225 重復值為:area@80, 重復個數7398 重復值為:area@82, 重復個數69823 重復值為:area@81, 重復個數98317 重復值為:area@84, 重復個數91775 重復值為:area@83, 重復個數72053 重復值為:area@180, 重復個數2362 重復值為:area@86, 重復個數264487 重復值為:area@181, 重復個數2927 重復值為:area@85, 重復個數230484 重復值為:area@88, 重復個數87527 重復值為:area@87, 重復個數74987 重復值為:area@89, 重復個數130297 重復值為:area@188, 重復個數24463 重復值為:area@189, 重復個數15699 重復值為:area@186, 重復個數13517 重復值為:area@187, 重復個數4774 重復值為:area@184, 重復個數5022 重復值為:area@185, 重復個數6737 重復值為:area@182, 重復個數12705 重復值為:area@183, 重復個數18961 重復值為:area@289, 重復個數20715 重復值為:area@168, 重復個數15179 重復值為:area@169, 重復個數1276 重復值為:area@91, 重復個數31664 重復值為:area@90, 重復個數61261 重復值為:area@93, 重復個數32496 重復值為:area@92, 重復個數55877 重復值為:area@95, 重復個數40933 重復值為:area@94, 重復個數32564 重復值為:area@290, 重復個數300 重復值為:area@97, 重復個數21405 重復值為:area@170, 重復個數37696 重復值為:area@291, 重復個數212 重復值為:area@96, 重復個數12442 重復值為:area@99, 重復個數2526 重復值為:area@98, 重復個數17456 重復值為:area@298, 重復個數12688 重復值為:area@177, 重復個數17285 重復值為:area@178, 重復個數11511 重復值為:area@299, 重復個數6622 重復值為:area@175, 重復個數9573 重復值為:area@296, 重復個數2416 重復值為:area@176, 重復個數8109 重復值為:area@297, 重復個數27915 重復值為:area@173, 重復個數58942 重復值為:area@294, 重復個數18842 重復值為:area@295, 重復個數3482 重復值為:area@174, 重復個數31452 重復值為:area@292, 重復個數11436 重復值為:area@171, 重復個數656 重復值為:area@172, 重復個數31557 重復值為:area@293, 重復個數1726 重復值為:type@1, 重復個數288479 重復值為:type@0, 重復個數21067365
使用spark對hive表中的多列數據判重