Spark: reading and writing CSV files
阿新 · Published: 2019-01-04
If you are on Spark 1.6.0, add the following Maven dependency:
<dependency>
    <groupId>com.databricks</groupId>
    <artifactId>spark-csv_2.10</artifactId>
    <version>1.4.0</version>
    <scope>compile</scope>
</dependency>
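If the project is built with sbt instead of Maven, a minimal equivalent dependency line would look roughly like the following (assuming a Scala 2.10 build, to match the spark-csv_2.10 artifact above):

// build.sbt: same artifact and version as the Maven dependency above
libraryDependencies += "com.databricks" % "spark-csv_2.10" % "1.4.0"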
If you are on Spark 2.0 or later, no Maven dependency is needed, because CSV read/write support is built into Spark itself (a Spark 2.0+ version of the example is sketched after the code below).
package com.egridcloud.spark

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by LHX on 2018/3/20 13:26.
  */
object SparkReadFile {
  def main(args: Array[String]): Unit = {
    val localpath = "D:\\input\\word.csv"
    val outpath = "D:\\output\\word2"
    val conf = new SparkConf()
    conf.setAppName("SparkReadFile")
    conf.setMaster("local")
    val sparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sparkContext)
    // Read the CSV file
    val data: DataFrame = sqlContext.read.format("com.databricks.spark.csv")
      .option("header", "false")            // "true" if the first row of the CSV is a header, otherwise "false"
      .option("inferSchema", true.toString) // automatically infer the data type of each column
      .load(localpath)
    // data.show()
    // Write the CSV file
    data.repartition(1).write.format("com.databricks.spark.csv")
      .option("header", "false")   // "true" if the first row of the CSV is a header, otherwise "false"
      .option("delimiter", ",")    // "," is the default delimiter
      .save(outpath)
    sparkContext.stop()
  }
}
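Since Spark 2.0+ ships CSV support out of the box, here is a minimal sketch of the same read/write flow using SparkSession and the built-in csv source; the object name SparkReadFileV2 is a placeholder, and the paths are simply carried over from the example above.

package com.egridcloud.spark

import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkReadFileV2 {
  def main(args: Array[String]): Unit = {
    // Reusing the placeholder paths from the Spark 1.6 example above
    val localpath = "D:\\input\\word.csv"
    val outpath = "D:\\output\\word2"
    val spark = SparkSession.builder()
      .appName("SparkReadFileV2")
      .master("local")
      .getOrCreate()

    // Read: the built-in "csv" source replaces "com.databricks.spark.csv"
    val data: DataFrame = spark.read
      .option("header", "false")
      .option("inferSchema", "true")
      .csv(localpath)

    // Write: repartition to a single output file, comma-delimited by default
    data.repartition(1).write
      .option("header", "false")
      .csv(outpath)

    spark.stop()
  }
}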