
SparkML in Practice (2): KMeans

package class8

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

/**
 * Created by root on 16-1-12.
 */
object Kmeans {
  def main(args: Array[String]) {
    // Suppress unnecessary log output on the terminal
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Set up the execution environment
    val conf = new SparkConf().setAppName("Kmeans").setMaster("local[4]")
    // To run on a cluster, use setMaster("spark://moon:7077") and submit the packaged jar
    // sc.addJar("/path/to/jarfile")
    val sc = new SparkContext(conf)

    // Load the data set; kmeans_data.txt contains:
    // 0.0 0.0 0.0
    // 0.1 0.1 0.1
    // 0.2 0.2 0.2
    // 9.0 9.0 9.0
    // 9.1 9.1 9.1
    // 9.2 9.2 9.2
    val data = sc.textFile("/usr/local/spark/spark-data/data/class8/kmeans_data.txt", 1)
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))

    // Cluster the data into 2 classes, training the model with 20 iterations
    val numClusters = 2
    val numIterations = 20
    val model = KMeans.train(parsedData, numClusters, numIterations)

    // Print the model's cluster centers
    println("Cluster centers:")
    for (c <- model.clusterCenters) {
      println(" " + c.toString)
    }

    // Evaluate the model by the within-set sum of squared errors (the cost on the training set)
    val cost = model.computeCost(parsedData)
    println("Within Set Sum of Squared Errors = " + cost)

    // Use the model to classify individual test points
    println("Vector 0.2 0.2 0.2 belongs to cluster: " +
      model.predict(Vectors.dense("0.2 0.2 0.2".split(' ').map(_.toDouble)))) // 1
    println("Vector 0.25 0.25 0.25 belongs to cluster: " +
      model.predict(Vectors.dense("0.25 0.25 0.25".split(' ').map(_.toDouble))))
    println("Vector 8 8 8 belongs to cluster: " +
      model.predict(Vectors.dense("8 8 8".split(' ').map(_.toDouble))))

    // Cross evaluation 1: return only the predictions (testdata is the same as parsedData)
    val testdata = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
    val result1 = model.predict(testdata)
    // result1.saveAsTextFile("/usr/local/spark/spark-data/data/class8/result_kmeans1")
    result1.foreach(println)

    // Cross evaluation 2: return both the data points and their predicted clusters
    // val result2 = data.map { line =>
    //   val lineVector = Vectors.dense(line.split(' ').map(_.toDouble))
    //   val prediction = model.predict(lineVector)
    //   line + " " + prediction
    // }.saveAsTextFile("/usr/local/spark/spark-data/data/class8/result_kmeans2")

    sc.stop()
  }
}
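
On this toy data set, the two centers should converge to roughly (0.1, 0.1, 0.1) and (9.1, 9.1, 9.1), the means of the two point groups, giving a within-set sum of squared errors of about 0.12. Note that which group receives cluster id 0 or 1 can vary between runs, since k-means initialization is random.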
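Since computeCost returns the WSSSE for any trained model, a common way to choose the number of clusters is to train over a range of k values and look for the "elbow" where the cost stops dropping sharply. A minimal sketch, reusing parsedData and numIterations from the code above (the range 1 to 6 is only illustrative):

    // Cache the RDD, since each call to KMeans.train scans it repeatedly
    parsedData.cache()
    for (k <- 1 to 6) {
      val m = KMeans.train(parsedData, k, numIterations)
      println(s"k = $k  WSSSE = " + m.computeCost(parsedData))
    }

With the six points above, the cost should drop dramatically from k = 1 to k = 2 and only marginally afterwards, pointing to k = 2.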
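The trained model itself can also be persisted, not just the predictions. A sketch assuming Spark 1.4 or later, where MLlib's KMeansModel supports save and load; the output path is illustrative and must not already exist:

    import org.apache.spark.mllib.clustering.KMeansModel

    // Write the model out as Parquet plus metadata
    model.save(sc, "/usr/local/spark/spark-data/data/class8/kmeans_model")

    // Reload it in a later job and predict as before
    val sameModel = KMeansModel.load(sc, "/usr/local/spark/spark-data/data/class8/kmeans_model")
    println(sameModel.predict(Vectors.dense(0.25, 0.25, 0.25)))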