程式人生 > Scala語言實現Kmeans聚類演算法

Scala語言實現Kmeans聚類演算法



/**
 * @author weixu_000
 */

import java.util.Random
import scala.io.Source
import java.io._

/**
 * K-means clustering of numeric rows read from a text file.
 *
 * The program reads points from `data/testData.txt`, picks `k` random points as
 * initial centers (written to `data/centers.txt`), iterates until the centers stop
 * moving, and writes one cluster label per input point to `data/output.txt`.
 */
object Kmeans {

  /** Number of clusters. */
  val k = 5

  /** Number of leading values taken from every input row (dimension of the data set). */
  val dim = 41

  /** Convergence threshold: iteration stops once the summed displacement of all centers is at or below it. */
  val shold = 0.0000000001

  /** Current cluster centers; every slot is null until `findCenters` assigns it. */
  val centers = new Array[Vector[Double]](k)

  /**
   * Entry point: load the points, choose initial centers, cluster, write the labels.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val points = readPoints("data/testData.txt")

    findCenters(points)
    kmeans(points, centers)
    putout(points, centers)
  }

  /**
   * Reads every non-blank line of `fileName` as one point.
   *
   * @param fileName path of the whitespace-separated data file
   * @return one vector of `dim` values per non-blank line, in file order
   */
  private def readPoints(fileName: String): Array[Vector[Double]] = {
    val source = Source.fromFile(fileName)
    try {
      // getLines() is lazy, so the rows must be materialised before the source is closed.
      source.getLines()
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(parsePoint)
        .toArray
    } finally {
      source.close()
    }
  }

  /**
   * Parses the first `dim` whitespace-separated values of a row.
   *
   * @param line a trimmed, non-empty row of the data file
   * @return the first `dim` values as an immutable vector
   * @throws IllegalArgumentException if the row has fewer than `dim` values
   */
  private def parsePoint(line: String): Vector[Double] = {
    val parts = line.split("\\s+")
    require(parts.length >= dim,
      s"expected at least $dim values per row but found ${parts.length}: $line")
    // Only the first `dim` columns are converted, so trailing columns are ignored.
    parts.take(dim).map(_.toDouble).toVector
  }

  //-------------------------find centers----------------------------------
  /**
   * Fills `centers` with randomly chosen input points (sampled with replacement)
   * and writes the chosen centers to `data/centers.txt`, one per line.
   *
   * @param points the data set to sample from; must not be empty
   * @throws IllegalArgumentException if `points` is empty
   */
  def findCenters(points: Array[Vector[Double]]): Unit = {
    require(points.nonEmpty, "cannot choose initial centers from an empty data set")
    val rand = new Random(System.currentTimeMillis())
    for (i <- centers.indices) {
      // nextInt(n) already yields 0 until n; subtracting 1 could produce index -1.
      centers(i) = points(rand.nextInt(points.length))
    }

    val writerCenters = new PrintWriter(new File("data/centers.txt"))
    try centers.foreach(center => writerCenters.println(center))
    finally writerCenters.close()
  }

  //-----------------------------doing cluster----------------------------
  /**
   * Runs k-means until the centers stop moving, updating `centers` in place.
   *
   * @param points  the data set to cluster
   * @param centers the initial centers; overwritten with the final centers
   */
  def kmeans(points: Array[Vector[Double]], centers: Array[Vector[Double]]): Unit = {
    var converged = false
    while (!converged) {
      // Group the points by their nearest center: the map key is the center,
      // the value is the set of points assigned to it.
      val cluster = points.groupBy(closestCenter(centers, _))

      // The new center of a cluster is the mean of its points; a center that
      // attracted no points is kept unchanged.
      val newCenters = centers.map { oldCenter =>
        cluster.get(oldCenter) match {
          case Some(pointsInCluster) =>
            vectorDivide(pointsInCluster.reduceLeft(vectorAdd(_, _)), pointsInCluster.length)
          case None => oldCenter
        }
      }

      // vectorDis already returns the Euclidean distance, so no further sqrt is applied.
      val movement = centers.zip(newCenters).map { case (old, updated) => vectorDis(old, updated) }.sum
      newCenters.copyToArray(centers)
      converged = movement <= shold
    }
  }

  //---------------------------putout-----------------------------------------
  /**
   * Writes the cluster label of every point to `data/output.txt`, one per line.
   * A label is the 0-based index in `centers` of the point's nearest center, so
   * points in the same cluster share the same number.
   *
   * @param points  the clustered data set
   * @param centers the final cluster centers
   */
  def putout(points: Array[Vector[Double]], centers: Array[Vector[Double]]): Unit = {
    // The assignment is not retained by kmeans, so it is recomputed here.
    val pointLable = points.map(point => centers.indexOf(closestCenter(centers, point)))

    val writerLable = new PrintWriter(new File("data/output.txt"))
    try pointLable.foreach(label => writerLable.println(label))
    finally writerLable.close()
  }

  /**
   * Euclidean distance between two vectors.
   *
   * @param v1 first vector
   * @param v2 second vector, same length as `v1`
   * @return the square root of the sum of squared component differences
   * @throws IllegalArgumentException if the vectors differ in length
   */
  def vectorDis(v1: Vector[Double], v2: Vector[Double]): Double = {
    require(v1.length == v2.length, "vectors must have the same length")
    val squaredSum = v1.zip(v2).map { case (a, b) => (a - b) * (a - b) }.sum
    math.sqrt(squaredSum)
  }

  // Vector is immutable: operations such as `updated` return a new vector instead
  // of modifying the receiver, so the arithmetic helpers below build fresh vectors.

  /**
   * Component-wise sum of two vectors.
   *
   * @param v1 first vector
   * @param v2 second vector, same length as `v1`
   * @return a new vector whose i-th component is `v1(i) + v2(i)`
   * @throws IllegalArgumentException if the vectors differ in length
   */
  def vectorAdd(v1: Vector[Double], v2: Vector[Double]): Vector[Double] = {
    require(v1.length == v2.length, "vectors must have the same length")
    v1.zip(v2).map { case (a, b) => a + b }
  }

  /**
   * Divides every component of a vector by an integer.
   *
   * @param v1  the vector to scale
   * @param num the divisor
   * @return a new vector whose i-th component is `v1(i) / num`
   */
  def vectorDivide(v1: Vector[Double], num: Int): Vector[Double] =
    v1.map(_ / num)

  /**
   * Finds the center nearest to a point. When two centers are equally close,
   * the one that appears later in `centers` wins.
   *
   * @param centers candidate centers; must not be empty
   * @param point   the point to classify
   * @return the element of `centers` with the smallest distance to `point`
   */
  def closestCenter(centers: Array[Vector[Double]], point: Vector[Double])
      : Vector[Double] = {
    centers.reduceLeft((a, b) =>
      if (vectorDis(a, point) < vectorDis(b, point)) a else b
    )
  }

}