Spark中元件Mllib的學習25之線性迴歸2-較大資料集(多元)
阿新 • • 發佈:2018-12-25
對多組資料進行model的training,然後再利用model來predict具體的值。過程中會輸出model的權重。
公式:$f(x) = a_1 x_1 + a_2 x_2 + a_3 x_3 + \cdots$
2.程式碼:
package org.apache.spark.mllib.learning.regression
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache .spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.apache.spark.{SparkConf, SparkContext}
import scala.Array.canBuildFrom
/**
 * Multivariate linear regression example built on MLlib's `LinearRegressionWithSGD`.
 *
 * The program reads `lpsa.data` (one sample per line, formatted as
 * `label,feature1 feature2 ...`), trains a model on the whole data set, predicts
 * the label of every training sample, prints the learned weights, writes the
 * `(label, prediction)` pairs to a timestamped output directory and finally
 * prints the training mean squared error.
 */
object LinearRegression {

  /**
   * Program entry point.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    // Keep Spark's and Jetty's own log output off the console.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Set up the execution environment: local mode with 4 threads. The application
    // name is this object's simple class name with the '$' characters removed.
    val appName = this.getClass().getSimpleName().filter(!_.equals('$'))
    val conf = new SparkConf().setAppName(appName).setMaster("local[4]")
    val sc = new SparkContext(conf)

    // try/finally guarantees the context is stopped even when a step below fails.
    try {
      // Load and parse the data.
      // Author's note: without the explicit `1` the result was written as two
      // files, presumably because two partitions were created by default.
      val data = sc.textFile("file/data/mllib/input/ridge-data/lpsa.data", 1)

      // Each line is "label,f1 f2 f3 ...". The parsed RDD is cached because the
      // trainer scans it once per iteration and it is reused for prediction;
      // without the cache the file would be re-read and re-parsed on every pass.
      val parsedData = data.map { line =>
        val parts = line.split(',')
        LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
      }.cache()

      // Build the model.
      // Note: the data used for training and the data used for prediction are
      // not split, so the error reported below is a training error.
      val numIterations = 100
      val model = LinearRegressionWithSGD.train(parsedData, numIterations)

      // Evaluate the model on the training examples.
      val valuesAndPreds = parsedData.map { point =>
        val prediction = model.predict(point.features)
        (point.label, prediction)
      }

      // Print the learned weights.
      val weights = model.weights
      println("model.weights" + weights)

      // Save the (label, prediction) pairs under a directory named after the
      // current timestamp, so repeated runs never collide on the output path.
      val iString = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date())
      val path = "file/data/mllib/output/LinearRegression/" + iString + "/result"
      valuesAndPreds.saveAsTextFile(path)

      // Training mean squared error: sum of squared residuals over the sample count.
      val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.reduce(_ + _) /
        valuesAndPreds.count()
      println("training Mean Squared Error = " + MSE)
    } finally {
      sc.stop()
    }
  }
}
資料(lpsa.data)請見 GitHub,或 Spark 原始碼中的 data/mllib/ridge-data/lpsa.data。
3.結果:
model.weights[0.5808575763272221,0.18930001482946976,0.2803086929991066,0.1110834181777876,0.4010473965597895,-0.5603061626684255,-0.5804740464000981,0.8742741176970946]
training Mean Squared Error = 6.207597210613579