Scala語言實現Kmeans聚類演算法
阿新 • • 發佈:2019-01-03
/**
 * @author weixu_000
 */
import java.util.Random
import scala.io.Source
import java.io._

/**
 * Plain k-means clustering over whitespace-separated numeric rows.
 *
 * The program reads `data/testData.txt`, picks `k` random rows as the initial
 * centres (written to `data/centers.txt`), iterates until the centres stop
 * moving, and writes one cluster label per input row to `data/output.txt`.
 */
object Kmeans {

  /** Number of clusters. */
  val k = 5

  /** Number of leading columns of each input row that are used as features. */
  val dim = 41

  /** Convergence threshold: iteration stops once the total centre movement is at or below it. */
  val shold = 0.0000000001

  /** Current cluster centres; filled by [[findCenters]] and updated in place by [[kmeans]]. */
  val centers = new Array[Vector[Double]](k)

  /**
   * Entry point: load the data set, choose initial centres, cluster, write labels.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    //------------------------------------input data ------------------------
    val fileName = "data/testData.txt"
    val source = Source.fromFile(fileName)
    // getLines() is lazy, so the rows must be materialised before the file is closed.
    val points: Array[Vector[Double]] =
      try {
        source.getLines()
          .map(_.trim)
          .filter(_.nonEmpty) // tolerate blank and trailing empty lines
          .map(parseLine)
          .toArray
      } finally {
        source.close()
      }

    findCenters(points)
    kmeans(points, centers)
    putout(points, centers)
  }

  /**
   * Converts one text row into a feature vector made of its first `dim` columns.
   * Columns are separated by any run of whitespace; columns beyond `dim` are ignored.
   *
   * @throws IllegalArgumentException if the row has fewer than `dim` columns
   * @throws NumberFormatException    if one of the first `dim` columns is not a number
   */
  private def parseLine(line: String): Vector[Double] = {
    val parts = line.split("\\s+")
    require(parts.length >= dim,
      s"expected at least $dim columns but found ${parts.length}: $line")
    parts.take(dim).map(_.toDouble).toVector
  }

  //-------------------------find centers----------------------------------
  /**
   * Fills [[centers]] with `k` points drawn at random (with replacement) from
   * `points` and writes them to `data/centers.txt`, one per line.
   *
   * @param points the data set to draw from; must not be empty
   * @throws IllegalArgumentException if `points` is empty
   */
  def findCenters(points: Array[Vector[Double]]): Unit = {
    require(points.nonEmpty, "cannot choose initial centers from an empty data set")
    val rand = new Random(System.currentTimeMillis())
    for (i <- 0 until k) {
      // nextInt(n) is already in [0, n); subtracting 1 would allow index -1
      // and would never select the last point.
      centers(i) = points(rand.nextInt(points.length))
    }
    val writerCenters = new PrintWriter(new File("data/centers.txt"))
    try {
      centers.foreach(c => writerCenters.println(c))
    } finally {
      writerCenters.close()
    }
  }

  //-----------------------------doing cluster----------------------------
  /**
   * Runs k-means iterations, updating `centers` in place until the summed
   * Euclidean movement of all centres is no greater than [[shold]].
   *
   * @param points  the data set
   * @param centers the initial centres; overwritten with the final centres
   */
  def kmeans(points: Array[Vector[Double]], centers: Array[Vector[Double]]): Unit = {
    var converged = false
    while (!converged) {
      // Group every point under its nearest centre. The map key is the centre
      // vector itself (compared by value), the map value is the points assigned to it.
      val cluster = points.groupBy(p => closestCenter(centers, p))

      // The new centre is the mean of its cluster; a centre that attracted no
      // point keeps its previous position.
      val newCenters = centers.map { oldCenter =>
        cluster.get(oldCenter)
          .map(members => vectorDivide(members.reduceLeft(vectorAdd(_, _)), members.length))
          .getOrElse(oldCenter)
      }

      var movement = 0d
      for (i <- centers.indices) {
        // vectorDis already returns the Euclidean distance, so no further sqrt is applied.
        movement += vectorDis(centers(i), newCenters(i))
        centers(i) = newCenters(i)
      }
      converged = movement <= shold
    }
  }

  //---------------------------putout-----------------------------------------
  /**
   * Writes the cluster label of every point to `data/output.txt`, one per line
   * and in input order. The label is the zero-based index of the nearest centre
   * within `centers`, so points in the same cluster share the same number.
   *
   * @param points  the data set
   * @param centers the final centres
   */
  def putout(points: Array[Vector[Double]], centers: Array[Vector[Double]]): Unit = {
    val pointLable = points.map(p => centers.indexOf(closestCenter(centers, p)))
    val writerLable = new PrintWriter(new File("data/output.txt"))
    try {
      pointLable.foreach(label => writerLable.println(label))
    } finally {
      writerLable.close()
    }
  }

  /**
   * Euclidean distance between two vectors.
   * Components are paired positionally; both vectors are expected to have the same length.
   */
  def vectorDis(v1: Vector[Double], v2: Vector[Double]): Double = {
    val squaredSum = v1.zip(v2).map { case (a, b) => (a - b) * (a - b) }.sum
    math.sqrt(squaredSum)
  }

  /**
   * Component-wise sum of two vectors.
   * Vector is immutable (`updated` returns a new vector rather than changing
   * the receiver), so the result is built as a fresh vector.
   */
  def vectorAdd(v1: Vector[Double], v2: Vector[Double]): Vector[Double] =
    v1.zip(v2).map { case (a, b) => a + b }

  /** Divides every component of `v1` by `num`. */
  def vectorDivide(v1: Vector[Double], num: Int): Vector[Double] =
    v1.map(_ / num)

  /**
   * Returns the centre nearest to `point`.
   * When two centres are equally distant, the one that appears later in `centers` wins.
   *
   * @throws UnsupportedOperationException if `centers` is empty
   */
  def closestCenter(centers: Array[Vector[Double]], point: Vector[Double]): Vector[Double] =
    centers.reduceLeft((a, b) => if (vectorDis(a, point) < vectorDis(b, point)) a else b)
}