
Machine Learning: kNN Classification with KNeighborsClassifier

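This post walks through scikit-learn's KNeighborsClassifier on the built-in handwritten digits dataset: a baseline fit with the default parameters, followed by score curves over n_neighbors for the two weighting schemes ('uniform' and 'distance') and for several values of the Minkowski parameter p.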
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 25 12:04:15 2018

@author: muli
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split


def load_classification_data():
    '''
    Load the dataset used for the classification model.

    :return: a tuple of: training samples, test samples, training labels, test labels
    '''
    # use the handwritten digits dataset (Digits Dataset) bundled with scikit-learn
    digits=datasets.load_digits() 
    X_train=digits.data
    y_train=digits.target
    # stratified split, with the test set taking 1/4 of the data
    return train_test_split(X_train,y_train,test_size=0.25,
            random_state=0,stratify=y_train)


def test_KNeighborsClassifier(*data):
    '''
    Test the basic usage of KNeighborsClassifier.

    :param data: variadic argument; a tuple of: training samples, test samples,
        training labels, test labels
    :return: None
    '''
    X_train,X_test,y_train,y_test=data
    print(y_train.size)          # number of training samples
    print("--------------")
    clf=neighbors.KNeighborsClassifier()   # defaults: n_neighbors=5, weights='uniform'
    clf.fit(X_train,y_train)
    print("Training Score:%f"%clf.score(X_train,y_train))
    print("Testing Score:%f"%clf.score(X_test,y_test))


def test_KNeighborsClassifier_k_w(*data):
    '''
    Test the effect of the n_neighbors and weights parameters of KNeighborsClassifier.

    :param data: variadic argument; a tuple of: training samples, test samples,
        training labels, test labels
    :return: None
    '''
    X_train,X_test,y_train,y_test=data
    # 100 candidate values of K, from 1 up to (but not including) the training set size
    Ks=np.linspace(1,y_train.size,num=100,endpoint=False,dtype='int')
    weights=['uniform','distance']

    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    ### plot the prediction score versus n_neighbors for each weighting scheme
    for weight in weights:
        training_scores=[]
        testing_scores=[]
        for K in Ks:
            clf=neighbors.KNeighborsClassifier(weights=weight,n_neighbors=K)
            clf.fit(X_train,y_train)
            testing_scores.append(clf.score(X_test,y_test))
            training_scores.append(clf.score(X_train,y_train))
        ax.plot(Ks,testing_scores,label="testing score:weight=%s"%weight)
        ax.plot(Ks,training_scores,label="training score:weight=%s"%weight)
    ax.legend(loc='best')
    ax.set_xlabel("K")
    ax.set_ylabel("score")
    ax.set_ylim(0,1.05)
    ax.set_title("KNeighborsClassifier")
    plt.show()


def test_KNeighborsClassifier_k_p(*data):
    '''
    Test the effect of the n_neighbors and p parameters of KNeighborsClassifier.

    :param data: variadic argument; a tuple of: training samples, test samples,
        training labels, test labels
    :return: None
    '''
    X_train,X_test,y_train,y_test=data
    # candidate values of K (np.linspace's default num=50), from 1 up to (but not including) the training set size
    Ks=np.linspace(1,y_train.size,endpoint=False,dtype='int')
    # p of the Minkowski metric: 1 = Manhattan distance, 2 = Euclidean distance
    Ps=[1,2,10]

    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    ### plot the prediction score versus n_neighbors for each value of p
    for P in Ps:
        training_scores=[]
        testing_scores=[]
        for K in Ks:
            clf=neighbors.KNeighborsClassifier(p=P,n_neighbors=K)
            clf.fit(X_train,y_train)
            testing_scores.append(clf.score(X_test,y_test))
            training_scores.append(clf.score(X_train,y_train))
        ax.plot(Ks,testing_scores,label="testing score:p=%d"%P)
        ax.plot(Ks,training_scores,label="training score:p=%d"%P)
    ax.legend(loc='best')
    ax.set_xlabel("K")
    ax.set_ylabel("score")
    ax.set_ylim(0,1.05)
    ax.set_title("KNeighborsClassifier")
    plt.show()



if __name__=='__main__':
    # load the dataset for the classification model
    X_train,X_test,y_train,y_test=load_classification_data()
    # call test_KNeighborsClassifier
#    test_KNeighborsClassifier(X_train,X_test,y_train,y_test)
    # call test_KNeighborsClassifier_k_w
#    test_KNeighborsClassifier_k_w(X_train,X_test,y_train,y_test)
    # call test_KNeighborsClassifier_k_p
    test_KNeighborsClassifier_k_p(X_train,X_test,y_train,y_test)
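
    # A minimal sketch (added example, assuming scikit-learn >= 0.18 so that
    # sklearn.model_selection is available): the manual sweeps over n_neighbors,
    # weights and p above can also be done with GridSearchCV. Uncomment to try it.
#    from sklearn.model_selection import GridSearchCV
#    param_grid={'n_neighbors':[1,3,5,10,20,50],
#                'weights':['uniform','distance'],
#                'p':[1,2,10]}
#    search=GridSearchCV(neighbors.KNeighborsClassifier(),param_grid,cv=5)
#    search.fit(X_train,y_train)
#    print("Best parameters:",search.best_params_)
#    print("Testing Score:%f"%search.score(X_test,y_test))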