
Implementing the k-means algorithm from Zhou Zhihua's Machine Learning book in Python

Hello, all.

In the previous post we implemented a decision tree in Python. In this post we implement the k-means algorithm in Python, and in a later post we will implement k-means with MapReduce.

The code is as follows:

# coding=utf-8
import math
import copy as cp
import random as rd
import matplotlib.pyplot as plt
from numpy import *

'''
@author :chenyuqing
@mail   :
[email protected]
'''

def load_data(path):
    '''
    :param path: path of the data file
    :return: numpy array with one sample per row
    '''
    data_set = []
    file_object = open(path)
    for line in file_object.readlines():
        lineArr = line.strip().split('\t')
        lineArr = [float(x) for x in lineArr]  # convert the strings to numbers
        data_set.append(lineArr)
    file_object.close()
    data_set = array(data_set)
    return data_set

def my_kmeans(k, data_set):
    '''
    :param k: number of clusters
    :param data_set: numpy array of 2-D samples
    :return: list of k clusters, each a list of samples
    '''
    sample_data_index = rd.sample(list(range(0, len(data_set))), k)
    start_list = []                          # current cluster centers
    end_list = [[0, 0] for n in range(k)]    # cluster centers from the previous iteration
    end_result = [[] for n in range(k)]      # samples assigned to each cluster
    for temp in sample_data_index:           # pick k random samples as the initial centers
        start_list.append(data_set[temp].tolist())
    while start_list != end_list:
        end_result = [[] for n in range(k)]  # reset the assignments at every iteration
        for i in range(0, len(data_set)):
            temp_distance = float("inf")
            temp_result = 0
            for j in range(0, len(start_list)):
                distance = math.sqrt(math.pow(data_set[i][0] - start_list[j][0], 2) +
                                     math.pow(data_set[i][1] - start_list[j][1], 2))
                if distance < temp_distance:  # keep the nearest center
                    temp_distance = distance
                    temp_result = j           # index of the cluster this sample belongs to
            end_result[temp_result].append(data_set[i].tolist())
        end_list = cp.deepcopy(start_list)
        for i in range(0, len(end_result)):
            # round the new centers, otherwise precision errors from the copy can cause an endless loop
            start_list[i][0] = round(sum([x[0] for x in end_result[i]]) / float(len(end_result[i])), 6)
            start_list[i][1] = round(sum([x[1] for x in end_result[i]]) / float(len(end_result[i])), 6)
    print("the result is :\n", end_result)
    return end_result

if __name__ == '__main__':
    print("------------my kmeans-----------")
    path = u"./西瓜資料集4.0.txt"
    data_set = load_data(path=path)
    print(data_set)
    result = my_kmeans(3, data_set=data_set)
    print(result[0])
    print(result[1])
    print(result[2])
    one_x = [x[0] for x in result[0]]
    one_y = [x[1] for x in result[0]]
    two_x = [x[0] for x in result[1]]
    two_y = [x[1] for x in result[1]]
    three_x = [x[0] for x in result[2]]
    three_y = [x[1] for x in result[2]]
    plt.scatter(one_x, one_y, s=20, marker='o', color='m')
    plt.scatter(two_x, two_y, s=20, marker='+', color='c')
    plt.scatter(three_x, three_y, s=20, marker='*', color='r')
    plt.show()
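
As a quick sanity check, the same data can also be clustered with scikit-learn's KMeans and the centers compared against the hand-written version. This is an optional sketch, not part of the original script, and it assumes scikit-learn is installed:

    # Optional cross-check with scikit-learn (assumed to be installed).
    from sklearn.cluster import KMeans

    data_set = load_data(u"./西瓜資料集4.0.txt")
    km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(data_set)
    print(km.cluster_centers_)   # centers found by scikit-learn
    print(km.labels_)            # cluster index assigned to every sample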

The output is as follows:

[[0.697, 0.46], [0.744, 0.376], [0.634, 0.264], [0.608, 0.318], [0.639, 0.161], [0.657, 0.198], [0.719, 0.103], [0.748, 0.232], [0.714, 0.346], [0.751, 0.489], [0.725, 0.445]]
[[0.403, 0.237], [0.243, 0.267], [0.36, 0.37], [0.339, 0.241], [0.282, 0.257], [0.483, 0.312], [0.478, 0.437], [0.525, 0.369], [0.532, 0.472], [0.473, 0.376], [0.446, 0.459]]
[[0.556, 0.215], [0.481, 0.149], [0.437, 0.211], [0.666, 0.091], [0.245, 0.057], [0.343, 0.099], [0.593, 0.042], [0.359, 0.188]]
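
Each printed list is one cluster, and its center is simply the mean of the points in it. A minimal sketch for recomputing the final centers from the returned clusters (assuming result is the list returned by my_kmeans):

    # Recompute the final cluster centers from the returned clusters.
    for idx, cluster in enumerate(result):
        cx = sum(p[0] for p in cluster) / float(len(cluster))
        cy = sum(p[1] for p in cluster) / float(len(cluster))
        print("center of cluster %d: (%.3f, %.3f)" % (idx, cx, cy))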

Result visualization: a scatter plot of the three clusters produced by the plt.scatter calls above (cluster 1 as magenta circles, cluster 2 as cyan plus markers, cluster 3 as red stars).

The watermelon dataset 4.0 used (two values per sample; the file read by load_data is tab-separated, and a snippet for writing it follows the listing):

0.697	0.46
0.744	0.376
0.634	0.264
0.608	0.318
0.556	0.215
0.403	0.237
0.481	0.149
0.437	0.211
0.666	0.091
0.243	0.267
0.245	0.057
0.343	0.099
0.639	0.161
0.657	0.198
0.36	0.37
0.593	0.042
0.719	0.103
0.359	0.188
0.339	0.241
0.282	0.257
0.748	0.232
0.714	0.346
0.483	0.312
0.478	0.437
0.525	0.369
0.751	0.489
0.532	0.472
0.473	0.376
0.725	0.445
0.446	0.459
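
A minimal sketch for writing this dataset to the tab-separated file the script reads; the samples list below is only started here, the remaining pairs from the listing above would be filled in the same way:

    # Write the watermelon dataset 4.0 as a tab-separated file for load_data.
    samples = [(0.697, 0.46), (0.744, 0.376), (0.634, 0.264)]  # ... the remaining 27 samples go here
    with open(u"./西瓜資料集4.0.txt", "w") as f:
        for x, y in samples:
            f.write("%.3f\t%.3f\n" % (x, y))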



A follow-up post will rewrite this as a MapReduce (MR) program; stay tuned.

Thanks.