K-means和K-means++演算法程式碼實現(Python)
阿新 • • 發佈:2019-01-30
K-means和K-means++主要區別在於,K-means++演算法選擇初始類中心時,儘可能選擇相距較遠的類中心,而K-means僅僅是隨機初始化類中心。
# K-means clustering with uniformly random initial centers.
# (K-means run)
import codecs

from numpy import *


def distance(x1, x2):
    """Return the Euclidean distance between two same-length 1-D arrays."""
    return sqrt(sum(power(x1 - x2, 2)))


def randcenter(dataset, k):
    """Draw k random centers uniformly inside the bounding box of *dataset*.

    Returns a (k, dim) array; column i is sampled from [min_i, max_i] of
    the data's i-th coordinate.  (Renamed the parameter from ``set`` so it
    no longer shadows the builtin.)
    """
    dim = shape(dataset)[1]
    init_cen = zeros((k, dim))
    for i in range(dim):
        min_i = min(dataset[:, i])
        range_i = float(max(dataset[:, i]) - min_i)
        init_cen[:, i] = min_i + range_i * random.rand(k)
    return init_cen


def Kmeans(dataset, k):
    """Plain K-means.

    Returns (center, cluster_assign): center is (k, dim); cluster_assign[i]
    holds (cluster index, squared distance to that cluster's center) for
    sample i.

    Bug fix: the original called get_centroids(), which is defined only in
    the K-means++ version of this post — running this script alone raised
    NameError.  Plain K-means must initialize with randcenter().
    """
    row_m = shape(dataset)[0]
    cluster_assign = zeros((row_m, 2))
    center = randcenter(dataset, k)  # was: get_centroids(dataset, k)
    change = True
    while change:
        change = False
        # Assignment step: attach every sample to its nearest center.
        for i in range(row_m):
            mindist = inf
            min_index = -1
            for j in range(k):
                distance1 = distance(center[j, :], dataset[i, :])
                if distance1 < mindist:
                    mindist = distance1
                    min_index = j
            if cluster_assign[i, 0] != min_index:
                change = True
            cluster_assign[i, :] = min_index, mindist ** 2
        # Update step: move each center to the mean of its members.
        for cen in range(k):
            cluster_data = dataset[nonzero(cluster_assign[:, 0] == cen)]
            if len(cluster_data) > 0:  # guard: empty cluster would give a NaN mean
                center[cen, :] = mean(cluster_data, 0)
    return center, cluster_assign


def main():
    # Plotting imports kept local so the algorithm can be imported headless.
    from pylab import xlim, ylim
    import matplotlib.pyplot as plt

    data = []
    labels = []
    # Read tab-separated "x<TAB>y<TAB>label" rows.
    with codecs.open("data.txt", "r") as f:
        for line in f.readlines():
            x, y, label = line.strip().split('\t')
            data.append([float(x), float(y)])
            labels.append(float(label))
    datas = array(data)
    k = 3  # number of clusters

    cluster_center, cluster_assign = Kmeans(datas, k)
    print(cluster_center)
    # Fix the axis range for the scatter plot.
    xlim(0, 10)
    ylim(0, 10)
    f1 = plt.figure(1)
    plt.scatter(datas[nonzero(cluster_assign[:, 0] == 0), 0], datas[nonzero(cluster_assign[:, 0] == 0), 1], marker='o', color='r', label='0', s=30)
    plt.scatter(datas[nonzero(cluster_assign[:, 0] == 1), 0], datas[nonzero(cluster_assign[:, 0] == 1), 1], marker='+', color='b', label='1', s=30)
    plt.scatter(datas[nonzero(cluster_assign[:, 0] == 2), 0], datas[nonzero(cluster_assign[:, 0] == 2), 1], marker='*', color='g', label='2', s=30)
    # Bug fix: the original plotted centers as (y, x); column 0 is x, column 1 is y.
    plt.scatter(cluster_center[:, 0], cluster_center[:, 1], marker='x', color='m', s=50)
    plt.show()


if __name__ == "__main__":
    main()
類中心:
[[ 7.16504475 7.12121176]
[ 2.94805141 2.84547461]
[ 4.92859254 4.93144926]]
# K-means++ clustering: initial centers are chosen spread far apart
# (Arthur & Vassilvitskii, 2007) instead of uniformly at random.
# (K-means++ run)
import codecs

import numpy as np  # bug fix: original relied on `from pylab import *` leaking `np`
from numpy import *


def distance(x1, x2):
    """Return the Euclidean distance between two same-length 1-D arrays."""
    return sqrt(sum(power(x1 - x2, 2)))


def nearest(point, cluster_centers):
    """Return the shortest distance from *point* to any center chosen so far."""
    min_dist = inf
    m = np.shape(cluster_centers)[0]  # number of centers initialized so far
    for i in range(m):
        d = distance(point, cluster_centers[i, ])
        if min_dist > d:
            min_dist = d
    return min_dist


def get_centroids(dataset, k):
    """k-means++ seeding.

    The first center is a uniformly random sample; each subsequent center
    is a sample drawn with probability proportional to its SQUARED distance
    D(x)^2 from the nearest already-chosen center.

    Bug fix: the original weighted by D(x); the k-means++ algorithm
    specifies D(x)^2 (Arthur & Vassilvitskii, 2007).
    """
    m, n = np.shape(dataset)
    cluster_centers = np.zeros((k, n))
    index = np.random.randint(0, m)
    cluster_centers[0, ] = dataset[index, ]
    d = [0.0 for _ in range(m)]
    for i in range(1, k):
        sum_all = 0
        for j in range(m):
            # Squared shortest distance to the centers chosen so far.
            d[j] = nearest(dataset[j, ], cluster_centers[0:i, ]) ** 2
            sum_all += d[j]
        # Roulette-wheel selection: a random point in [0, sum_all) lands in
        # sample j's slot with probability d[j] / sum_all.
        sum_all *= random.rand()
        for j, di in enumerate(d):
            sum_all = sum_all - di
            if sum_all > 0:
                continue
            cluster_centers[i, ] = dataset[j, ]
            break
    return cluster_centers


def Kmeans(dataset, k):
    """K-means with k-means++ initialization.

    Returns (center, cluster_assign): center is (k, dim); cluster_assign[i]
    holds (cluster index, squared distance to that cluster's center) for
    sample i.
    """
    row_m = shape(dataset)[0]
    cluster_assign = zeros((row_m, 2))
    center = get_centroids(dataset, k)
    change = True
    while change:
        change = False
        # Assignment step: attach every sample to its nearest center.
        for i in range(row_m):
            mindist = inf
            min_index = -1
            for j in range(k):
                distance1 = distance(center[j, :], dataset[i, :])
                if distance1 < mindist:
                    mindist = distance1
                    min_index = j
            if cluster_assign[i, 0] != min_index:
                change = True
            cluster_assign[i, :] = min_index, mindist ** 2
        # Update step: move each center to the mean of its members.
        for cen in range(k):
            cluster_data = dataset[nonzero(cluster_assign[:, 0] == cen)]
            if len(cluster_data) > 0:  # guard: empty cluster would give a NaN mean
                center[cen, :] = mean(cluster_data, 0)
    return center, cluster_assign


def main():
    # Plotting imports kept local so the algorithm can be imported headless.
    from pylab import xlim, ylim
    import matplotlib.pyplot as plt

    data = []
    labels = []
    # Read tab-separated "x<TAB>y<TAB>label" rows.
    with codecs.open("data.txt", "r") as f:
        for line in f.readlines():
            x, y, label = line.strip().split('\t')
            data.append([float(x), float(y)])
            labels.append(float(label))
    datas = array(data)

    cluster_center, cluster_assign = Kmeans(datas, 3)
    print(cluster_center)
    # Fix the axis range for the scatter plot.
    xlim(0, 10)
    ylim(0, 10)
    f1 = plt.figure(1)
    plt.scatter(datas[nonzero(cluster_assign[:, 0] == 0), 0], datas[nonzero(cluster_assign[:, 0] == 0), 1], marker='o', color='r', label='0', s=30)
    plt.scatter(datas[nonzero(cluster_assign[:, 0] == 1), 0], datas[nonzero(cluster_assign[:, 0] == 1), 1], marker='+', color='b', label='1', s=30)
    plt.scatter(datas[nonzero(cluster_assign[:, 0] == 2), 0], datas[nonzero(cluster_assign[:, 0] == 2), 1], marker='*', color='g', label='2', s=30)
    # Bug fix: the original plotted centers as (y, x); column 0 is x, column 1 is y.
    plt.scatter(cluster_center[:, 0], cluster_center[:, 1], marker='x', color='m', s=50)
    plt.show()


if __name__ == "__main__":
    main()
類中心:
[[ 4.92859254 4.93144926]
[ 2.94805141 2.84547461]
[ 7.16504475 7.12121176]]