Spark SQL 筆記(7)—— DataFrame API操作案例
阿新 • 發佈:2018-12-19
1 測試資料
stu.txt
1|Anaa|111111|[email protected]
2|Bob|22222|[email protected]
3|Candy|333333|[email protected]
4|Dany|44444|[email protected]
5|Elf|55555|[email protected]
6|Frank|6666|[email protected]
7|George|777777|[email protected]@.com
8|Hlnk|888888|[email protected]
9||999999|[email protected]
10||101010|[email protected]
11|NULL|121212|[email protected]
2 測試程式碼
package com.tzb.demo2
import org.apache.spark.sql.SparkSession
object DataFrameTest {

  /** One student record, parsed from an "id|name|phone|email" line of stu.txt. */
  case class Stu(id: Int, name: String, phone: String, email: String)

  /**
   * Parses one pipe-delimited line into a [[Stu]].
   *
   * The split limit of -1 keeps trailing empty fields, so a line whose
   * last field is empty still yields four elements instead of throwing
   * ArrayIndexOutOfBoundsException.
   */
  private def parseStu(line: String): Stu = {
    val fields = line.split("\\|", -1)
    Stu(fields(0).toInt, fields(1), fields(2), fields(3))
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("DataFrameAPITest")
      .master("local[2]")
      .getOrCreate()

    val rdd = spark.sparkContext.textFile("file:///d://stu.txt")

    import spark.implicits._

    // Build the DataFrame from the parsed RDD. The parsing logic lives in
    // parseStu so it is not duplicated for the second DataFrame below.
    val stuDF = rdd.map(parseStu).toDF()

    // show() displays at most 20 rows by default
    // stuDF.show()
    // stuDF.take(10).foreach(println)
    stuDF.first()
    stuDF.head(3)
    // stuDF.select("email").show(20, false)
    // stuDF.select("name", "email").show(20, false)
    // stuDF.filter("name=''").show()
    // stuDF.filter("name='' OR name='NULL'").show()
    // students whose name starts with 'H'
    // stuDF.filter("SUBSTR(name,0,1)='H'").show()
    // stuDF.sort(stuDF("name")).show()
    // stuDF.sort(stuDF("name").desc).show()
    // stuDF.sort(stuDF("name").asc, stuDF("id").desc).show()

    // A second, independently built DataFrame so the join below has two
    // distinct plans (kept separate, as in the original, rather than
    // joining stuDF with itself).
    val stuDF2 = rdd.map(parseStu).toDF()
    stuDF.join(stuDF2, stuDF.col("id") === stuDF2.col("id")).show()

    spark.stop()
  }
}