
A Python Implementation of the k-means Clustering Algorithm

# -*- coding: UTF-8 -*-
import numpy
import random
import codecs
import copy
import re
import matplotlib.pyplot as plt

def calcuDistance(vec1, vec2):
    # Compute the Euclidean distance between vectors vec1 and vec2
    return numpy.sqrt(numpy.sum(numpy.square(vec1 - vec2)))
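
# A quick sanity check (illustrative, not in the original post): the 3-4-5
# right triangle gives a Euclidean distance of exactly 5.0.
assert calcuDistance(numpy.array([0, 0]), numpy.array([3, 4])) == 5.0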

def loadDataSet(inFile):
    # Load the test data set.
    # The data is stored as text, one 2-D coordinate per line.
    inDate = codecs.open(inFile, 'r', 'utf-8').readlines()
    dataSet = list()
    for line in inDate:
        line = line.strip()
        strList = re.split('[ ]+', line)  # split on one or more spaces
        # print(strList[0], strList[1])
        numList = list()
        for item in strList:
            num = float(item)
            numList.append(num)
            # print(numList)
        dataSet.append(numList)

    return dataSet      # dataSet = [[], [], [], ...]
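
# The expected input format (an assumption inferred from the parsing above) is
# one space-separated 2-D point per line, for example:
#   0.5  1.2
#   -3.1 4.0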

def initCentroids(dataSet, k):
    # Initialize the k centroids by picking k data points at random.
    return random.sample(dataSet, k)  # return k randomly chosen items from dataSet
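
# For reproducible runs (an optional addition, not in the original post), the
# random initialization can be seeded first, e.g.:
#   random.seed(42)
#   centroidList = initCentroids(dataSet, 4)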

def minDistance(dataSet, centroidList):
    # For each item in dataSet, compute the Euclidean distance to each of the
    # k centroids in centroidList, find the nearest one, and assign the item
    # to the corresponding cluster.

    clusterDict = dict()                 # store the clustering result in a dict
    for item in dataSet:
        vec1 = numpy.array(item)         # convert to an array
        flag = 0                         # label of the nearest cluster found so far
        minDis = float("inf")            # initialize the minimum distance to infinity

        for i in range(len(centroidList)):
            vec2 = numpy.array(centroidList[i])
            distance = calcuDistance(vec1, vec2)  # Euclidean distance to centroid i
            if distance < minDis:
                minDis = distance
                flag = i                          # after the loop, flag holds the label of the centroid nearest to item

        if flag not in clusterDict.keys():   # initialize the cluster if the label is new
            clusterDict[flag] = list()
        # print(flag, item)
        clusterDict[flag].append(item)       # add the item to its cluster

    return clusterDict                       # return the new clustering result
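
# An illustrative example (not in the original post): with centroids at (0, 0)
# and (10, 10), each point is assigned to the nearer centroid.
assert minDistance([[0, 0], [1, 0], [10, 10]], [[0, 0], [10, 10]]) == \
    {0: [[0, 0], [1, 0]], 1: [[10, 10]]}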

def getCentroids(clusterDict):
    # Recompute the k centroids.
    centroidList = list()
    for key in clusterDict.keys():
        centroid = numpy.mean(numpy.array(clusterDict[key]), axis = 0)  # column-wise mean of the cluster, i.e. its new centroid
        # print(key, centroid)
        centroidList.append(centroid)

    return numpy.array(centroidList).tolist()
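
# An illustrative example (not in the original post): the centroid of (0, 0)
# and (2, 2) is their column-wise mean (1.0, 1.0).
assert getCentroids({0: [[0, 0], [2, 2]]}) == [[1.0, 1.0]]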

def getVar(clusterDict, centroidList):
    # Compute the objective used as the convergence criterion:
    # for each cluster, accumulate the distances from its points to its
    # centroid, then sum over all clusters.

    total = 0.0
    for key in clusterDict.keys():
        vec1 = numpy.array(centroidList[key])
        distance = 0.0
        for item in clusterDict[key]:
            vec2 = numpy.array(item)
            distance += calcuDistance(vec1, vec2)
        total += distance

    return total
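
# The quantity returned by getVar is the total point-to-centroid distance
#   E = sum_j sum_{x in C_j} ||x - mu_j||,
# where C_j is cluster j and mu_j its centroid. The main loop below stops once
# E changes by less than 0.0001 between two consecutive iterations.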

def showCluster(centroidList, clusterDict):
    # Visualize the clustering result.

    colorMark = ['or', 'ob', 'og', 'ok', 'oy', 'ow']      # markers for the clusters: 'o' = circle, 'r' = red, 'b' = blue, ...
    centroidMark = ['dr', 'db', 'dg', 'dk', 'dy', 'dw']   # markers for the centroids: 'd' = diamond, same colors as above
    for key in clusterDict.keys():
        plt.plot(centroidList[key][0], centroidList[key][1], centroidMark[key], markersize = 12)  # plot the centroid
        for item in clusterDict[key]:
            plt.plot(item[0], item[1], colorMark[key]) # plot the points of this cluster

    plt.show()
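
# A variant sketch (not in the original post): the hard-coded marker lists
# above support at most 6 clusters, so this version colors any number of
# clusters with a matplotlib colormap. The function name and colormap choice
# are assumptions, not part of the original code.
import matplotlib.cm as cm

def showClusterAnyK(centroidList, clusterDict):
    colors = cm.rainbow(numpy.linspace(0, 1, len(centroidList)))  # one RGBA color per cluster
    for key in clusterDict.keys():
        xs = [item[0] for item in clusterDict[key]]
        ys = [item[1] for item in clusterDict[key]]
        plt.scatter(xs, ys, color=colors[key], marker='o')         # cluster points
        plt.scatter(centroidList[key][0], centroidList[key][1],
                    color=colors[key], marker='d', s=120)          # centroid
    plt.show()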

if __name__ == '__main__':

    inFile = "D:/ML/clustering/testSet.txt"            # path to the data set file
    dataSet = loadDataSet(inFile)                      # load the data set
    centroidList = initCentroids(dataSet, 4)           # initialize the centroids with k = 4
    clusterDict = minDistance(dataSet, centroidList)   # first clustering iteration
    newVar = getVar(clusterDict, centroidList)         # objective value; the change between old and new values is the stopping criterion
    oldVar = -0.0001                                   # initialize the previous objective so the first loop test passes
    print('***** Iteration 1 *****')
    print()
    print('Clusters')
    for key in clusterDict.keys():
        print(key, ' --> ', clusterDict[key])
    print('k centroid vectors: ', centroidList)
    print('Sum of point-to-centroid distances: ', newVar)
    print()
    showCluster(centroidList, clusterDict)             # visualize the clustering result

    k = 2                                              # iteration counter (iteration 1 was done above)
    while abs(newVar - oldVar) >= 0.0001:              # stop once the objective changes by less than 0.0001 between two consecutive iterations
        centroidList = getCentroids(clusterDict)          # recompute the centroids
        clusterDict = minDistance(dataSet, centroidList)  # new clustering result
        oldVar = newVar
        newVar = getVar(clusterDict, centroidList)

        print('***** Iteration %d *****' % k)
        print()
        print('Clusters')
        for key in clusterDict.keys():
            print(key, ' --> ', clusterDict[key])
        print('k centroid vectors: ', centroidList)
        print('Sum of point-to-centroid distances: ', newVar)
        print()
        showCluster(centroidList, clusterDict)            # visualize the clustering result

        k += 1
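
If the original testSet.txt is not at hand, a minimal sketch like the following (not part of the original post; the blob centers and output path are arbitrary assumptions) can generate a compatible data file of space-separated 2-D points, with four blobs matching k = 4 above:

# generate_testset.py -- write a synthetic data file in the format expected by loadDataSet
import numpy

numpy.random.seed(0)
centers = [(1.0, 1.0), (5.0, 5.0), (1.0, 5.0), (5.0, 1.0)]
points = numpy.vstack([numpy.random.randn(30, 2) * 0.5 + c for c in centers])  # 30 points around each center
with open("testSet.txt", "w") as f:
    for x, y in points:
        f.write("%f %f\n" % (x, y))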