程式人生 > 機器學習之K-最近鄰規則分類(KNN)演算法

機器學習之K-最近鄰規則分類(KNN)演算法

準備分為兩個部分,一個是理論,一個就是程式碼實現。程式碼也可以在我的GitHub上下載,後面有連結。

一、理論知識

相信我的筆記還是比較詳細的
(此處原為圖片:KNN 理論筆記)

二、程式碼實現KNN演算法

1. 首先要生成一些資料集,以供訓練和測試

我造的資料是關於通過身高等資訊預測女生是什麼型別的資料(純屬扯淡的,不要當真)。屬性是身高、體重、腿長,分類標籤是[model,common,lolita]。資料量大小控制為150。
程式碼如下:

# -*- coding: utf-8 -*-
import random

def generate_data(min1, max1, min2, max2, min3, max3, len):
    """Generate random ``[height, weight, leg_length]`` samples.

    Each attribute is drawn independently with ``random.uniform`` from its
    own ``(min, max)`` range.

    Args:
        min1, max1: Bounds for the height value.
        min2, max2: Bounds for the weight value.
        min3, max3: Bounds for the leg-length value.
        len: Number of samples to generate. The name shadows the builtin
            inside this function; it is kept so existing callers still work.

    Returns:
        A list with ``len`` entries, each a list of three floats.
    """
    info = []
    for _ in range(len):
        height = random.uniform(min1, max1)
        # Weight may be a weak attribute: people in different height bands
        # can fall into the same narrow weight range. Experimental only.
        weight = random.uniform(min2, max2)
        leg_length = random.uniform(min3, max3)
        info.append([height, weight, leg_length])
    return info


def save(info, label):
    """Append every sample in ``info`` to ``data.txt`` tagged with ``label``.

    Each sample is echoed to stdout and written as one line of the form
    ``height weight leg_length label``.

    Args:
        info: Iterable of samples; the first three items of each are used.
        label: Class label written as the last field of every line.
    """
    # The context manager closes the file even if a write fails part-way.
    with open('data.txt', 'a+') as handle:
        for row in info:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print('%s %s %s' % (row[0], row[1], row[2]))
            handle.write('%s %s %s %s\n' % (row[0], row[1], row[2], label))


if __name__ == '__main__':
    # 50 samples per class; the ranges are made-up illustration values.
    model = generate_data(169.9, 180, 90, 110, 105, 111, 50)
    lolita = generate_data(155, 162, 85, 98, 95, 100, 50)
    common = generate_data(162, 169, 80, 115, 100, 104.5, 50)
    save(model, 'model')
    save(lolita, 'lolita')
    save(common, 'common')

得到的資料如下:
(此處原為圖片:生成的 data.txt 資料內容)

2. 利用KNN進行分類

# -*- coding: utf-8 -*-
import random
import math
def get_train_data():
    """Load ``data.txt`` and randomly split its rows into train and test sets.

    Every non-blank line is split on whitespace into a list of string
    tokens. A row is placed in the test set when ``random.random()`` falls
    below roughly one third, otherwise in the training set.

    Returns:
        A ``(train, test)`` tuple; each element is a list of token lists.
    """
    test = []
    train = []
    # The context manager guarantees the file is closed after reading.
    with open('data.txt') as handle:
        for line in handle:
            fields = line.strip().split()
            if not fields:
                # A blank line would become an empty row, and an empty row
                # has no label field to read with row[-1].
                continue
            if random.random() < 0.333333333333:
                test.append(fields)
            else:
                train.append(fields)
    return train, test
def cal_distance(a, b, dimension):
    """Return the Euclidean distance between two points.

    Only the first ``dimension`` items of ``a`` and ``b`` are compared, and
    each item is converted with ``float`` first, so numeric strings are
    accepted.
    """
    deltas = (float(a[axis]) - float(b[axis]) for axis in range(dimension))
    return math.sqrt(sum(delta * delta for delta in deltas))
def get_neighbor(K, test, train_data, dimension=3):
    """Return the ``K`` training rows closest to ``test``.

    Args:
        K: Maximum number of neighbours to return.
        test: Sample to classify; its first ``dimension`` items are used.
        train_data: Training rows; the first ``dimension`` items of each row
            are coordinates and the last item is the class label.
        dimension: Number of leading items treated as coordinates.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        A list of ``[distance, label]`` pairs in ascending order, ties on
        distance ordered by label. When ``train_data`` holds fewer than
        ``K`` rows, all of them are returned instead of raising
        ``IndexError``.
    """
    ranked = sorted(
        [cal_distance(test, row, dimension), row[-1]] for row in train_data
    )
    # max() keeps a negative K from turning into a "drop from the end" slice.
    return ranked[:max(K, 0)]

def get_prediction(K, neighbor):
    """Predict a label by majority vote over the first ``K`` neighbours.

    Args:
        K: Maximum number of neighbours allowed to vote.
        neighbor: Sequence of ``[distance, label]`` pairs, expected nearest
            first (the order produced by ``get_neighbor``).

    Returns:
        The label with the most votes. When several labels tie for the
        lead, the tied label that appears earliest in ``neighbor`` wins.
        With no voters at all, ``'common'`` is returned, matching the
        previous fallback.
    """
    # Slicing tolerates fewer than K entries; max() guards a negative K.
    voters = neighbor[:max(K, 0)]
    votes = {}
    for entry in voters:
        votes[entry[1]] = votes.get(entry[1], 0) + 1
    if not votes:
        return 'common'
    best = max(votes.values())
    # Walk the voters in order so a tie goes to the earliest-listed label
    # instead of always defaulting to one fixed class.
    return next(entry[1] for entry in voters if votes[entry[1]] == best)
def evaluate_Accuracy(test, train, K):
    """Return the fraction of ``test`` rows whose label is predicted correctly.

    Each test row is classified from its ``K`` nearest rows in ``train``
    and the prediction is compared with the row's last item.

    Args:
        test: Rows to evaluate; the last item of each row is its label.
        train: Rows used as the neighbour pool.
        K: Number of neighbours used for each prediction.

    Returns:
        A float between 0 and 1. An empty ``test`` yields ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    if not test:
        # The random split can leave the test set empty on small inputs.
        return 0.0
    right = 0
    for sample in test:
        neighbors = get_neighbor(K, sample, train)
        if get_prediction(K, neighbors) == sample[-1]:
            right += 1
    return float(right) / float(len(test))

if __name__ == '__main__':
    K = 5
    # Randomly split the samples in data.txt into training and test sets.
    train_data, test_data = get_train_data()
    # Fraction of test rows whose predicted label matches the stored label.
    precision = evaluate_Accuracy(test_data, train_data, K)
    # Parenthesised single-argument print gives the same output on
    # Python 2 and Python 3.
    print(precision)

    # Classify one hand-written sample that is not taken from data.txt.
    test = [172, 96, 106]
    neighbor = get_neighbor(K, test, train_data)
    prediction = get_prediction(K, neighbor)
    print('Girl Type: %s' % prediction)

結果如下:
(此處原為圖片:程式執行結果)