機器學習之kNN分類kneighbors_classifier
阿新 • • 發佈:2018-11-29
- 機器學習之kNN分類kneighbors_classifier
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 25 12:04:15 2018

@author: muli

k-nearest-neighbours classification demo on scikit-learn's bundled
handwritten-digits dataset: a baseline fit plus two parameter sweeps that
plot training/testing accuracy against ``n_neighbors``.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets

# ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and removed
# in 0.20; ``train_test_split`` now lives in ``sklearn.model_selection``.
# Fall back to the legacy module only so very old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split


def load_classification_data():
    '''
    Load the dataset used by the classification examples.

    :return: a tuple of, in order: training samples, testing samples,
        training labels, testing labels
    '''
    # Use the handwritten-digits dataset that ships with scikit-learn.
    digits = datasets.load_digits()
    X_train = digits.data
    y_train = digits.target
    # Stratified split (class proportions preserved); the test set is 1/4
    # of the data and random_state is fixed so runs are reproducible.
    return train_test_split(X_train, y_train, test_size=0.25,
                            random_state=0, stratify=y_train)


def _score_over_k(data, Ks, **knn_params):
    '''
    Fit one KNeighborsClassifier per value in ``Ks`` and collect its scores.

    :param data: 4-tuple of training samples, testing samples, training
        labels, testing labels
    :param Ks: iterable of ``n_neighbors`` values to try
    :param knn_params: extra keyword arguments forwarded to
        KNeighborsClassifier (e.g. ``weights`` or ``p``)
    :return: ``(training_scores, testing_scores)``, two lists aligned with
        ``Ks``
    '''
    X_train, X_test, y_train, y_test = data
    training_scores = []
    testing_scores = []
    for K in Ks:
        clf = neighbors.KNeighborsClassifier(n_neighbors=K, **knn_params)
        clf.fit(X_train, y_train)
        testing_scores.append(clf.score(X_test, y_test))
        training_scores.append(clf.score(X_train, y_train))
    return training_scores, testing_scores


def _finish_plot(ax):
    '''
    Apply the legend, axis labels, y-range and title shared by both sweeps,
    then show the figure.

    :param ax: the matplotlib Axes the curves were drawn on
    :return: None
    '''
    ax.legend(loc='best')
    ax.set_xlabel("K")
    ax.set_ylabel("score")
    # Scores lie in [0, 1]; a little headroom keeps the top curve visible.
    ax.set_ylim(0, 1.05)
    ax.set_title("KNeighborsClassifier")
    plt.show()


def test_KNeighborsClassifier(*data):
    '''
    Demonstrate basic KNeighborsClassifier usage with default parameters.

    :param data: variadic; must be exactly four items, in order: training
        samples, testing samples, training labels, testing labels
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    print(y_train.size)
    print("--------------")
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)
    print("Training Score:%f" % clf.score(X_train, y_train))
    print("Testing Score:%f" % clf.score(X_test, y_test))


def test_KNeighborsClassifier_k_w(*data):
    '''
    Show how ``n_neighbors`` and ``weights`` affect KNeighborsClassifier.

    :param data: variadic; must be exactly four items, in order: training
        samples, testing samples, training labels, testing labels
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    # 100 integer K values spread from 1 up to (but excluding) the number
    # of training samples.
    Ks = np.linspace(1, y_train.size, num=100, endpoint=False, dtype='int')
    weights = ['uniform', 'distance']

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # One testing curve and one training curve per weighting scheme.
    for weight in weights:
        training_scores, testing_scores = _score_over_k(
            (X_train, X_test, y_train, y_test), Ks, weights=weight)
        ax.plot(Ks, testing_scores, label="testing score:weight=%s" % weight)
        ax.plot(Ks, training_scores, label="training score:weight=%s" % weight)
    _finish_plot(ax)


def test_KNeighborsClassifier_k_p(*data):
    '''
    Show how ``n_neighbors`` and ``p`` affect KNeighborsClassifier.

    :param data: variadic; must be exactly four items, in order: training
        samples, testing samples, training labels, testing labels
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    # num=50 is np.linspace's default, spelled out here; the original relied
    # on it implicitly (half as many K values as the weights sweep).
    Ks = np.linspace(1, y_train.size, num=50, endpoint=False, dtype='int')
    Ps = [1, 2, 10]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # One testing curve and one training curve per value of p.
    for P in Ps:
        training_scores, testing_scores = _score_over_k(
            (X_train, X_test, y_train, y_test), Ks, p=P)
        ax.plot(Ks, testing_scores, label="testing score:p=%d" % P)
        ax.plot(Ks, training_scores, label="training score:p=%d" % P)
    _finish_plot(ax)


if __name__ == '__main__':
    # Load the dataset for the classification examples.
    X_train, X_test, y_train, y_test = load_classification_data()
    # Alternative demos -- uncomment the one to run:
    # test_KNeighborsClassifier(X_train, X_test, y_train, y_test)
    # test_KNeighborsClassifier_k_w(X_train, X_test, y_train, y_test)
    test_KNeighborsClassifier_k_p(X_train, X_test, y_train, y_test)