# 1. 程式人生 > kMeans聚類的Python實現

# kMeans聚類的Python實現

from numpy import *
import matplotlib.pyplot as plt

# Helper functions


def loadDataSet(filename):
    """Load a tab-separated numeric data file.

    Each non-blank line of *filename* is split on tab characters and every
    field is converted with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even when a row fails to parse.
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise reach float('') and raise.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (NumPy array or matrix row).
        vecB: Second vector, broadcast-compatible with *vecA*.

    Returns:
        The square root of the summed squared differences, as a scalar.
    """
    # Use the array's own .sum() so the result is a scalar regardless of
    # which ``sum`` name the star import left in scope.
    return sqrt(power(vecA - vecB, 2).sum())


def randCent(dataSet, k):
    """Build *k* random centroids inside the bounding box of *dataSet*.

    Args:
        dataSet: Two-dimensional data, one sample per row.
        k: Number of centroids to generate.

    Returns:
        A ``(k, n)`` NumPy matrix, where ``n`` is the number of columns of
        *dataSet*; column ``j`` holds values drawn uniformly between the
        minimum and maximum of data column ``j``.
    """
    dataSet = asanyarray(dataSet)
    # Number of features (columns).
    n = shape(dataSet)[1]
    # asmatrix is used instead of mat, which NumPy 2.0 removed.
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        # random.rand yields values in [0, 1); scale them into the column span.
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, dist=distEclud, createCent=randCent):
    """Cluster the rows of *dataSet* into *k* groups with k-means.

    Args:
        dataSet: Two-dimensional data, one sample per row.
        k: Number of clusters.
        dist: Callable ``dist(centroid_row, sample_row)`` returning a distance.
        createCent: Callable ``createCent(dataSet, k)`` returning the initial
            centroids as a ``(k, n)`` array or matrix; it is updated in place.

    Returns:
        A tuple ``(centroids, label)``: the final centroids and a 1-D float
        array of length ``m`` holding the cluster index of every sample.
    """
    dataSet = asanyarray(dataSet)
    m = shape(dataSet)[0]
    # Start from -1, which is never a valid cluster index, so that the first
    # assignment pass always counts as a change. Starting from 0 would make a
    # first pass that puts every point in cluster 0 look already converged.
    label = full(m, -1.0)
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assign every sample to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = dist(centroids[j, :], dataSet[i, :])
                # Strict comparison: ties go to the lowest centroid index.
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if label[i] != minIndex:
                clusterChanged = True
            label[i] = minIndex
        print(centroids)
        # Move each centroid to the mean of the samples assigned to it.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(label == cent)[0]]
            if len(ptsInClust) == 0:
                # The mean of an empty cluster is NaN and would poison this
                # centroid for good; keep its previous position instead.
                continue
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, label


if __name__ == '__main__':
    k = 4
    filename = 'testSet.txt'
    dataSet = loadDataSet(filename)
    dataArray = array(dataSet)
    centroids, label = kMeans(dataArray, k)
    # One marker shape and one color per cluster index.
    markers = 'o*s^'
    colors = 'bgrc'
    for point, cluster in zip(dataArray, label):
        idx = int(cluster)
        plt.plot(point[0], point[1], color=colors[idx], marker=markers[idx])
    # Draw the centroids as black plus signs.
    for i in range(len(centroids)):
        plt.plot(centroids[i, 0], centroids[i, 1], 'k+')
    # Display the figure built above.
    plt.show()