1. 程式人生 > >二分K-means聚類(bisecting K-means)

二分K-means聚類(bisecting K-means)

複製程式碼
 1 def biKmeans(dataSet, k, distMeas=distEclud):
 2     m = shape(dataSet)[0]
 3     clusterAssment = mat(zeros((m,2)))#記錄簇分配的結果及誤差
 4     centroid0 = mean(dataSet, axis=0).tolist()[0]#計算整個資料集的質心
 5     centList =[centroid0] #create a list with one centroid
 6     for j in range(m):#計算初始聚類點與其他點的距離
 7         clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
 8
while (len(centList) < k): 9 lowestSSE = inf 10 for i in range(len(centList)):#嘗試劃分每一簇 11 ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]#get the data points currently in cluster i 12 centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)#
對這個簇執行一個KMeans演算法,k=2 13 sseSplit = sum(splitClustAss[:,1])#compare the SSE to the currrent minimum 14 sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1]) 15 print "sseSplit, and notSplit: ",sseSplit,sseNotSplit 16 if (sseSplit + sseNotSplit) < lowestSSE:#
#劃分後更好的話 17 bestCentToSplit = i 18 bestNewCents = centroidMat 19 bestClustAss = splitClustAss.copy() 20 lowestSSE = sseSplit + sseNotSplit 21 bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #更新簇的分配結果change 1 to 3,4, or whatever 22 bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit 23 print 'the bestCentToSplit is: ',bestCentToSplit 24 print 'the len of bestClustAss is: ', len(bestClustAss) 25 centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]#replace a centroid with two best centroids 26 centList.append(bestNewCents[1,:].tolist()[0]) 27 clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss#reassign new clusters, and SSE 28 return mat(centList), clusterAssment
複製程式碼