spark利用MLlib實現kmeans演算法例項

spark利用MLlib實現kmeans演算法例項

spark版本 1.3.1

scala系統環境2.10.4  程式編譯版本2.11.8

需要注意的是,訓練資料和待測試資料都要是浮點型的,如果是int型的資料的話會報錯

package Kmeans
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext,SparkConf}
/**
 * Example: train a KMeans model with Spark MLlib on a small in-memory
 * data set, then predict the cluster of each vector read from args(0).
 *
 * Input lines are tab-separated numeric columns; every value must parse
 * as Double (integer-typed data is reported to fail — see the note in
 * the surrounding article).
 */
object KmeansTest {

  // Running index used when printing the cluster centers.
  // NOTE(review): kept as a public mutable member for compatibility;
  // a local zipWithIndex would be preferable if no external caller reads it.
  var ClusterIndex = 0

  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage:<dir:training data> <dir:test data>")
      System.exit(1)
    }

    // NOTE(review): no setAppName here — the app name must be supplied
    // externally (e.g. via spark-submit), otherwise SparkContext creation fails.
    val conf = new SparkConf()
    val sc = new SparkContext(conf)

    // Data to be classified. Must be Double-typed, not Int-typed.
    val data = sc.textFile(args(0))
    val parseddata = data
      .map { s =>
        Vectors.dense(
          s.split("\t").map(_.trim).filter(!"".equals(_)).map(_.toDouble))
      }
      .cache()

    val numIterators = 20
    val numCluster = 2

    // Small hard-coded training set (Double literals only).
    val data1 = sc
      .makeRDD(Array(
        Array(1.0, 10.1, 2.5),
        Array(2.0, 5.2, 3.8),
        Array(2.1, 4.0, 4.4),
        Array(0.6, 2.0, 3.1)))
      .map(f => Vectors.dense(f))

    val clusters = KMeans.train(data1, numCluster, numIterators)

    // Print each learned cluster center with its index.
    clusters.clusterCenters.foreach { x =>
      println("Center index of cluster is :" + ClusterIndex + ":")
      println(x)
      ClusterIndex += 1
    }

    // Assign every input vector to its nearest learned center.
    parseddata.collect().foreach { dataline =>
      val clusterIndex = clusters.predict(dataline)
      println("The data" + dataline + "belongs to cluster" + clusterIndex)
    }

    println("Kmeans cluster finished")
    sc.stop()
  }
}