1. 程式人生 > 用交叉驗證調整KNN模型的引數

用交叉驗證調整KNN模型的引數

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the data and print a quick overview of it
def inspect_data(file_root):
    """Read a CSV file into a DataFrame and print a short summary of it.

    Prints the frame's ``info()`` report, its row and column counts and its
    first rows, then returns the loaded frame.

    Args:
        file_root: Path (or any other source accepted by
            ``pandas.read_csv``) of the CSV file to load.

    Returns:
        The ``pandas.DataFrame`` parsed from ``file_root``.
    """
    dataframe = pd.read_csv(file_root)
    print("資料基本資訊:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line afterwards.
    dataframe.info()
    n_rows, n_cols = dataframe.shape
    print("資料有%i行,%i列" % (n_rows, n_cols))
    print("資料預覽:")
    print(dataframe.head())
    return dataframe
# Missing-data handling
def processing_missing_data(dataframe):
    """Return ``dataframe`` without the rows that contain missing values.

    A frame with no missing value is handed back unchanged (the very same
    object); otherwise a new frame with the incomplete rows dropped is
    returned.
    """
    contains_missing = dataframe.isnull().values.any()
    if not contains_missing:
        return dataframe
    return dataframe.dropna()
# Load the data
dataframe = pd.read_csv("H:/pythonfigure/voice.csv")
# Handle missing data
dataframe = processing_missing_data(dataframe)
# Data conversion: encode the string labels as integers (male -> 1, female -> 0)
dataframe = dataframe.replace({"male": 1, "female": 0})
# Data preparation: every column but the last is a feature, the last one is
# the target. The .ix indexer was deprecated in pandas 0.20 and removed in
# 1.0; .iloc is the positional indexer that works on current pandas.
x = dataframe.iloc[:, :-1]
# Make the integer dtype explicit instead of relying on replace() to
# downcast the former string column.
y = dataframe.iloc[:, -1].astype("int64")
# Split into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1/3., random_state=5)
# Feature standardization. The scaler is fitted on the training split only,
# so that the mean/std of the held-out test rows do not leak into the
# features the model is trained and tuned on; the same training-set
# statistics are then applied to every split.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x = scaler.transform(x)
# Cross-validation: score every candidate k with 10-fold CV on the training
# set and keep the mean accuracy of each candidate.
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
k_range = range(1, 31)
cv_score = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring="accuracy")
    score_mean = scores.mean()
    cv_score.append(score_mean)
    print(k, score_mean)
# Look the winner up in k_range rather than assuming "index + 1 == k", so the
# result stays correct if k_range is changed to start elsewhere or to use a
# different step. np.argmax returns the first maximum, so ties resolve to the
# earliest candidate in k_range.
best_k = k_range[int(np.argmax(cv_score))]
print("最優的k是%i" % (best_k))
# Plot mean CV accuracy against k
plt.plot(k_range, cv_score)
plt.xlabel("k")
plt.ylabel("score")
plt.show()
# Model training: fit the final classifier with the selected k on the
# training split, then report its accuracy on the test split.
knn_model = KNeighborsClassifier(n_neighbors=best_k).fit(x_train, y_train)
print("模型準確率:", knn_model.score(x_test, y_test))