K-means和K-means++演算法程式碼實現(Python)

K-means和K-means++演算法程式碼實現(Python)

K-means和K-means++主要區別在於,K-means++演算法選擇初始類中心時,儘可能選擇相距較遠的類中心,而K-means僅僅是隨機初始化類中心。

#K-means演算法
from pylab import *
from numpy import *
import codecs
import matplotlib.pyplot as plt
data = []
labels = []
# Load the sample file: each line is a tab-separated (x, y, label) triple.
with codecs.open("data.txt", "r") as fh:
    for row in fh:
        x_val, y_val, lab = row.strip().split('\t')
        data.append([float(x_val), float(y_val)])
        labels.append(float(lab))
datas = array(data)
k = 3  # number of clusters
#計算歐式距離
def distance(x1, x2):
    """Return the Euclidean distance between two points (NumPy arrays)."""
    diff = x1 - x2
    return sqrt(sum(diff ** 2))
#隨機初始化類中心
def randcenter(set, k):
    """Pick k random initial centroids, each feature drawn uniformly
    from that feature's [min, max] range in the data.

    NOTE(review): the parameter name shadows the builtin `set`; kept
    as-is to preserve the existing call signature.
    """
    dim = shape(set)[1]
    init_cen = zeros((k, dim))
    for col in range(dim):
        lo = min(set[:, col])
        span = float(max(set[:, col]) - lo)
        init_cen[:, col] = lo + span * random.rand(k)
    return init_cen
#主程式
def Kmeans(dataset, k):
    """Cluster `dataset` into k groups with Lloyd's algorithm.

    Args:
        dataset: (m, n) array of samples.
        k: number of clusters.

    Returns:
        (center, cluster_assign): the (k, n) centroid array and an
        (m, 2) array holding [cluster index, squared distance to
        that centroid] for every sample.
    """
    row_m = shape(dataset)[0]
    cluster_assign = zeros((row_m, 2))
    # BUG FIX: the original called get_centroids(), which is not defined
    # in this script (NameError); use the random initializer above.
    center = randcenter(dataset, k)
    change = True
    while change:
        change = False
        # Assignment step: attach each sample to its nearest centroid.
        for i in range(row_m):
            mindist = inf
            min_index = -1
            for j in range(k):
                distance1 = distance(center[j, :], dataset[i, :])
                if distance1 < mindist:
                    mindist = distance1
                    min_index = j
            if cluster_assign[i, 0] != min_index:
                change = True  # at least one sample switched cluster
            cluster_assign[i, :] = min_index, mindist ** 2
        # Update step: recompute each centroid as the mean of its members.
        for cen in range(k):
            cluster_data = dataset[nonzero(cluster_assign[:, 0] == cen)]
            # Guard against an empty cluster: keep the previous centroid
            # instead of producing NaN from mean() of an empty slice.
            if len(cluster_data) > 0:
                center[cen, :] = mean(cluster_data, 0)
    return center, cluster_assign
cluster_center, cluster_assign = Kmeans(datas, k)
print(cluster_center)
# Fix the axis ranges for the scatter plot.
xlim(0, 10)
ylim(0, 10)
# Scatter the samples, one colour/marker per cluster.
f1 = plt.figure(1)
plt.scatter(datas[nonzero(cluster_assign[:,0]==0),0],datas[nonzero(cluster_assign[:,0]==0),1],marker='o',color='r',label='0',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==1),0],datas[nonzero(cluster_assign[:,0]==1),1],marker='+',color='b',label='1',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==2),0],datas[nonzero(cluster_assign[:,0]==2),1],marker='*',color='g',label='2',s=30)
# BUG FIX: the centroids were plotted as (y, x) — columns swapped — while
# the data points are plotted as (x, y). Plot (col 0, col 1) to match.
plt.scatter(cluster_center[:,0], cluster_center[:,1], marker='x', color='m', s=50)
plt.show()
K-means執行結果:

   類中心:

    [[ 7.16504475  7.12121176]
     [ 2.94805141  2.84547461]
     [ 4.92859254  4.93144926]]

#K-means++
from pylab import *
from numpy import *
import codecs
import matplotlib.pyplot as plt
data = []
labels = []
# Load the sample file: each line is a tab-separated (x, y, label) triple.
with codecs.open("data.txt", "r") as fh:
    for row in fh:
        x_val, y_val, lab = row.strip().split('\t')
        data.append([float(x_val), float(y_val)])
        labels.append(float(lab))
datas = array(data)

#計算歐氏距離
def distance(x1, x2):
    """Return the Euclidean distance between two points (NumPy arrays)."""
    diff = x1 - x2
    return sqrt(sum(diff ** 2))

#對一個樣本找到與該樣本距離最近的聚類中心
def nearest(point, cluster_centers):
    """Return the distance from `point` to the closest of the centroids
    initialised so far (the rows of `cluster_centers`)."""
    best = inf
    for row in cluster_centers:
        d = distance(point, row)
        if d < best:
            best = d
    return best
#選擇儘可能相距較遠的類中心
def get_centroids(dataset, k):
    """K-means++ seeding: choose k initial centroids that are spread out.

    The first centroid is picked uniformly at random; each subsequent
    centroid is sampled with probability proportional to its squared
    distance to the nearest already-chosen centroid, per the k-means++
    paper (Arthur & Vassilvitskii, 2007).

    Args:
        dataset: (m, n) array of samples.
        k: number of centroids to produce.

    Returns:
        (k, n) array of initial centroids.
    """
    m, n = np.shape(dataset)
    cluster_centers = np.zeros((k, n))
    # 1. First centre: uniform random sample.
    index = np.random.randint(0, m)
    cluster_centers[0, ] = dataset[index, ]
    # 2. Per-sample weight buffer.
    d = [0.0 for _ in range(m)]
    for i in range(1, k):
        sum_all = 0
        for j in range(m):
            # 3. Weight of sample j = squared distance to its nearest
            # chosen centre. BUG FIX: k-means++ weights by the *squared*
            # distance D(x)^2, not by the plain distance.
            d[j] = nearest(dataset[j, ], cluster_centers[0:i, ]) ** 2
            # 4. Accumulate total weight.
            sum_all += d[j]
        # 5. Roulette-wheel selection: random point in [0, sum_all).
        sum_all *= random.rand()
        # 6. Walk the weights until the random point is passed; that
        # sample becomes the next centre.
        for j, di in enumerate(d):
            sum_all -= di
            if sum_all > 0:
                continue
            cluster_centers[i, ] = dataset[j, ]
            break
    return cluster_centers

#主程式
def Kmeans(dataset, k):
    """Cluster `dataset` into k groups with Lloyd's algorithm, seeded
    by the k-means++ initializer get_centroids().

    Args:
        dataset: (m, n) array of samples.
        k: number of clusters.

    Returns:
        (center, cluster_assign): the (k, n) centroid array and an
        (m, 2) array holding [cluster index, squared distance to
        that centroid] for every sample.
    """
    row_m = shape(dataset)[0]
    cluster_assign = zeros((row_m, 2))
    center = get_centroids(dataset, k)
    change = True
    while change:
        change = False
        # Assignment step: attach each sample to its nearest centroid.
        for i in range(row_m):
            mindist = inf
            min_index = -1
            for j in range(k):
                distance1 = distance(center[j, :], dataset[i, :])
                if distance1 < mindist:
                    mindist = distance1
                    min_index = j
            if cluster_assign[i, 0] != min_index:
                change = True  # at least one sample switched cluster
            cluster_assign[i, :] = min_index, mindist ** 2
        # Update step: recompute each centroid as the mean of its members.
        for cen in range(k):
            cluster_data = dataset[nonzero(cluster_assign[:, 0] == cen)]
            # Guard against an empty cluster: keep the previous centroid
            # instead of producing NaN from mean() of an empty slice.
            if len(cluster_data) > 0:
                center[cen, :] = mean(cluster_data, 0)
    return center, cluster_assign
cluster_center, cluster_assign = Kmeans(datas, 3)
print(cluster_center)

# Fix the axis ranges for the scatter plot.
xlim(0, 10)
ylim(0, 10)
# Scatter the samples, one colour/marker per cluster.
f1 = plt.figure(1)
plt.scatter(datas[nonzero(cluster_assign[:,0]==0),0],datas[nonzero(cluster_assign[:,0]==0),1],marker='o',color='r',label='0',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==1),0],datas[nonzero(cluster_assign[:,0]==1),1],marker='+',color='b',label='1',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==2),0],datas[nonzero(cluster_assign[:,0]==2),1],marker='*',color='g',label='2',s=30)
# BUG FIX: the centroids were plotted as (y, x) — columns swapped — while
# the data points are plotted as (x, y). Plot (col 0, col 1) to match.
plt.scatter(cluster_center[:,0], cluster_center[:,1], marker='x', color='m', s=50)
plt.show()
K-means++程式執行結果:

   類中心:

   [[ 4.92859254  4.93144926]
    [ 2.94805141  2.84547461]
    [ 7.16504475  7.12121176]]