1. 程式人生 > >spark-sql的進階案例

spark-sql的進階案例

(1)骨灰級案例--UDTF求wordcount

資料格式:
spark-sql的進階案例
每一行都是一個字串,單詞之間以空格分隔。
程式碼實現:

/**
 * Word count over a plain-text file, done entirely in Spark SQL.
 *
 * Every input line is loaded as one row (column `line`) of the temporary view
 * `lines`. The query splits each line on runs of whitespace, uses `explode`
 * to turn the resulting array into one row per word, and counts how many
 * times each word occurs. The result is printed with `show()`, ordered by
 * ascending count.
 */
object SparkSqlTest {
    def main(args: Array[String]): Unit = {
        // Silence the noisy framework loggers.
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        // NOTE(review): "org.project-spark" does not look like a package name any
        // logger would use; the intended name may be "org.spark_project" - confirm.
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)

        // Build the entry point.
        val conf: SparkConf = new SparkConf()
            .setAppName("SparkSqlTest")
            .setMaster("local[2]")

        // NOTE(review): this program only uses a temp view and built-in SQL
        // functions, so Hive support looks unnecessary here - confirm and drop.
        val spark: SparkSession = SparkSession.builder().config(conf)
            .enableHiveSupport()
            .getOrCreate()

        try {
            val wordDF: DataFrame = spark.read.text("C:\\z_data\\test_data\\ip.txt").toDF("line")
            // createOrReplaceTempView instead of createTempView: the latter throws
            // if a view named "lines" already exists in a reused session.
            wordDF.createOrReplaceTempView("lines")
            val sql =
                """
                  |select t1.word,count(1) counts
                  |from (
                  |select explode(split(line,'\\s+')) word
                  |from lines) t1
                  |group by t1.word
                  |order by counts
                """.stripMargin
            spark.sql(sql).show()
        } finally {
            // Always release the session, even when the query fails.
            spark.stop()
        }
    }
}

結果:
spark-sql的進階案例

(2)視窗函式求topN

資料格式:
spark-sql的進階案例
需求:取每門課程中成績最好的前三名。
程式碼實現:

/**
 * Top-3 scores per course, computed with a SQL window function.
 *
 * The JSON input is registered as the temporary view `student`. The inner
 * query numbers the rows of each course by descending score with
 * `row_number()`; the outer query keeps the rows numbered 1 to 3 and the
 * result is printed with `show()`.
 */
object SparkSqlTest {
    def main(args: Array[String]): Unit = {
        // Silence the noisy framework loggers.
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        // NOTE(review): "org.project-spark" does not look like a package name any
        // logger would use; the intended name may be "org.spark_project" - confirm.
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)

        // Build the entry point.
        val conf: SparkConf = new SparkConf()
            .setAppName("SparkSqlTest")
            .setMaster("local[2]")

        // NOTE(review): this program only uses a temp view and built-in SQL
        // functions, so Hive support looks unnecessary here - confirm and drop.
        val spark: SparkSession = SparkSession.builder().config(conf)
            .enableHiveSupport()
            .getOrCreate()

        try {
            val topnDF: DataFrame = spark.read.json("C:\\z_data\\test_data\\score.json")
            // createOrReplaceTempView instead of createTempView: the latter throws
            // if a view named "student" already exists in a reused session.
            topnDF.createOrReplaceTempView("student")
            val sql =
                """select
                  |t1.course course,
                  |t1.name name,
                  |t1.score score
                  |from (
                  |select
                  |course,
                  |name,
                  |score,
                  |row_number() over(partition by course order by score desc ) top
                  |from student) t1 where t1.top<=3
                """.stripMargin
            spark.sql(sql).show()
        } finally {
            // Always release the session, even when the query fails.
            spark.stop()
        }
    }
}

結果:
spark-sql的進階案例

(3)SparkSQL去處理DataSkew資料傾斜的問題

思路: (使用兩階段的聚合)
 - 找到發生資料傾斜的key
 - 對發生傾斜的key加上隨機字首,把它拆分成多個不同的key
 - 做區域性聚合
 - 去掉字首,還原成原本的key
 - 全域性聚合
以上面的wordcount為例,找出相應的資料量比較大的單詞
程式碼實現:

/**
 * Word count that works around data skew with a two-stage aggregation.
 *
 * Steps performed by `main`:
 *  1. Sample the words and pick the most frequent one in the sample as the
 *     "hot" (skewed) key.
 *  2. Split the words into two temporary views: `hot_table` (rows equal to the
 *     hot key) and `nor_table` (everything else).
 *  3. For `hot_table`, salt every row with a random numeric prefix
 *     (`add_prefix`), count per salted key, strip the prefix again
 *     (`remove_prefix`) and sum the partial counts.
 *  4. For `nor_table`, run a plain `group by` count.
 *  5. Concatenate both results and print them with `show()`.
 */
object SparkSqlTest {
    def main(args: Array[String]): Unit = {
        // Silence the noisy framework loggers.
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        // NOTE(review): "org.project-spark" does not look like a package name any
        // logger would use; the intended name may be "org.spark_project" - confirm.
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)

        // Build the entry point.
        val conf: SparkConf = new SparkConf()
            .setAppName("SparkSqlTest")
            .setMaster("local[2]")

        // NOTE(review): this program only uses temp views, built-in SQL functions
        // and session-registered UDFs, so Hive support looks unnecessary - confirm.
        val spark: SparkSession = SparkSession.builder().config(conf)
            .enableHiveSupport()
            .getOrCreate()

        try {
            val sqlContext: SQLContext = spark.sqlContext
            // Register the salting helpers so the SQL below can call them.
            sqlContext.udf.register[String,String,Integer]("add_prefix",add_prefix)
            sqlContext.udf.register[String,String]("remove_prefix",remove_prefix)

            val sc: SparkContext = spark.sparkContext
            // Single definition of the input path; it is read twice below.
            val inputPath = "C:\\z_data\\test_data\\ip.txt"

            // Stage 0: estimate the skewed word from a 20% sample.
            val wordsRDD: RDD[String] = sc.textFile(inputPath).flatMap(_.split("\\s+"))
            val sampleRDD: RDD[String] = wordsRDD.sample(withReplacement = false, fraction = 0.2)
            // headOption instead of take(1)(0): the sample can be empty (small or
            // empty input), in which case no word is treated as hot.
            val hotWord: Option[String] = sampleRDD
                .map(word => (word, 1))
                .reduceByKey(_ + _)
                .sortBy(_._2, ascending = false)
                .take(1)
                .headOption
                .map(_._1)
            val bs: Broadcast[Option[String]] = sc.broadcast(hotWord)

            import spark.implicits._
            val lineDF: DataFrame = sqlContext.read.text(inputPath)
            val wordDS: Dataset[String] = lineDF.flatMap(row => {
                row.getAs[String](0).split("\\s+")
            })

            // Rows that carry the skewed word.
            val hotDF: DataFrame = wordDS
                .filter((word: String) => bs.value.contains(word))
                .toDF("word")
            hotDF.createOrReplaceTempView("hot_table")

            // Rows that do not carry the skewed word.
            val norDF: DataFrame = wordDS
                .filter((word: String) => !bs.value.contains(word))
                .toDF("word")
            norDF.createOrReplaceTempView("nor_table")

            // "union all" rather than "union": the two branches cover disjoint
            // words and each is already grouped, so there are no duplicate rows
            // to remove and the extra distinct pass is wasted work.
            val sql =
                """
                  |(select
                  |t3.word,
                  |sum(t3.counts) counts
                  |from (select
                  |remove_prefix(t2.newword) word,
                  |t2.counts
                  |from (select
                  |t1.newword newword,
                  |count(1) counts
                  |from
                  |(select
                  |add_prefix(word,3) newword
                  |from hot_table) t1
                  |group by t1.newword) t2) t3
                  |group by t3.word)
                  |union all
                  |(select
                  | word,
                  | count(1) counts
                  |from nor_table
                  |group by word)
                """.stripMargin
            spark.sql(sql).show()
        } finally {
            // Always release the session, even when a step above fails.
            spark.stop()
        }
    }

    /**
     * UDF: salts a word with a random numeric prefix.
     *
     * @param word  the word to salt
     * @param range exclusive upper bound of the random prefix; must be positive
     * @return `"<n>_<word>"` where `n` is a random integer in `[0, range)`
     */
    def add_prefix(word:String,range:Integer): String = {
        // ThreadLocalRandom avoids allocating a new generator for every row.
        val salt = java.util.concurrent.ThreadLocalRandom.current().nextInt(range)
        s"${salt}_$word"
    }

    /**
     * UDF: strips the prefix added by `add_prefix`.
     *
     * Everything up to and including the first underscore is removed; a word
     * without any underscore is returned unchanged.
     *
     * @param word a salted word such as `"2_hello"`
     * @return the text after the first underscore
     */
    def remove_prefix(word:String): String = {
        word.substring(word.indexOf("_")+1)
    }
}

結果:
spark-sql的進階案例