
Using GridSearchCV in scikit-learn

Steps:

1. Choose and build the base model, model.

2. Wrap model in GridSearchCV to obtain the grid-search model, grid_model.

3. Fit grid_model on the training data; it selects the model whose parameters perform best on the validation folds as best_estimator.

4.1. Predict on the training set with best_estimator (the result can differ from the cross-validation scores, because during the search the training set was further split by cross-validation).

4.2. Predict on the test set with best_estimator.

5. Visualize the results: ROC (AUC) curve and precision-recall (AUPR) curve.
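
For orientation, here is a minimal sketch of steps 1-4 (it assumes x_train, y_train, x_test, y_test already exist; the grid values are only illustrative, not the ones used below):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()                                       # step 1: base model
param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10]}         # illustrative grid
grid_model = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')  # step 2: wrap in GridSearchCV
grid_model.fit(x_train, y_train)                                       # step 3: search, then refit with the best params
best_estimator = grid_model.best_estimator_
y_train_pred = best_estimator.predict(x_train)                         # step 4.1: predict on the training set
y_test_pred = best_estimator.predict(x_test)                           # step 4.2: predict on the test set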

I. Data

【Data preparation】

             X shape       y shape
Training set (1206, 294)   (1206,)
Test set     (64, 294)     (64,)
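
The code that produces these arrays is not shown in the original post; below is a minimal, self-contained sketch that yields arrays of the same shapes (the random data and the split are purely illustrative):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(1270, 294)                 # hypothetical feature matrix
y = np.random.randint(0, 2, size=1270)        # hypothetical binary labels
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=64, random_state=1231)
print(x_train.shape, y_train.shape)           # (1206, 294) (1206,)
print(x_test.shape, y_test.shape)             # (64, 294) (64,)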

II. Main model

【Environment setup】

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_recall_curve
from sklearn.metrics import confusion_matrix,make_scorer

【Model preparation】

seed = 1231
np.random.seed(seed)
# x_train, y_train, x_test, y_test are assumed to be prepared beforehand (shapes as in the table above)

names = ['Decision Tree', 'Random Forest']
classifiers = [DecisionTreeClassifier(),RandomForestClassifier()]

# note: max_features='auto' has been removed in newer scikit-learn versions; drop it there
parameter_dtc = {'max_features':['auto','sqrt','log2',None],'max_depth':range(3,100,2)}
parameter_rfc = {'n_estimators':range(5,200,5),'max_features':['auto','sqrt','log2',None],'max_depth':range(3,100,2)}

parameters = [parameter_dtc,parameter_rfc] # must line up one-to-one with names and classifiers
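
The original parameters list also referenced parameter_ada_dtc and parameter_mlp, whose definitions are not shown in the post. If you want to tune AdaBoost and MLP as well, the grids could look roughly like this (the values below are assumptions, not the original ones); extend names, classifiers and parameters consistently:

parameter_ada_dtc = {'n_estimators':range(50,300,50),'learning_rate':[0.01,0.1,1.0]}      # hypothetical AdaBoost grid
parameter_mlp = {'hidden_layer_sizes':[(50,),(100,),(100,50)],'alpha':[1e-4,1e-3,1e-2]}   # hypothetical MLP grid
# names += ['AdaBoost', 'MLP']
# classifiers += [AdaBoostClassifier(), MLPClassifier(max_iter=1000)]
# parameters += [parameter_ada_dtc, parameter_mlp]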

scoring = {'roc_auc':'roc_auc','accuracy':'accuracy', 'precision':'precision','recall':'recall','f1':'f1'}

【Main model function】

def grid_search_model(clf,param,name,x_train,y_train,x_test,y_test): # clf: classifier; param: parameter grid; name: classifier name
    model = GridSearchCV(clf,param,cv=5,verbose=2,scoring=scoring,refit='roc_auc',n_jobs=-1,return_train_score=True) # GridSearchCV model
    fit = model.fit(x_train,y_train) # fit the GridSearchCV model on the training data
    y_train_pred = fit.best_estimator_.predict(x_train) # predict on the training set with the best estimator
    y_test_pred = fit.best_estimator_.predict(x_test) # predict on the test set with the best estimator
    
    cv_results = pd.DataFrame(fit.cv_results_).set_index(['params']) # store cv_results_ as a DataFrame
    cv_results_mean = cv_results[['mean_train_accuracy', 'mean_train_f1','mean_train_precision', 'mean_train_recall', 'mean_train_roc_auc',
                                  'mean_test_accuracy','mean_test_f1', 'mean_test_precision', 'mean_test_recall','mean_test_roc_auc']] # mean of each score in cv_results
    cv_results_std = cv_results[['std_train_accuracy', 'std_train_f1', 'std_train_precision','std_train_recall', 'std_train_roc_auc',
                                 'std_test_accuracy', 'std_test_f1','std_test_precision', 'std_test_recall', 'std_test_roc_auc']] # std of each score in cv_results

#------------------- Display model results ------------------------------------------------------
  
    print('MODEL : %r' % name)
    print('Best cv_test_roc_auc: %f using %s' % (fit.best_score_,fit.best_params_)) # best_score_ and best_params_ of the best model found by the grid search
    print(cv_results_mean)
    print(cv_results_std)
    
    train_score_list = []
    test_score_list = []
    score_list = []
    model_metrics_name = [accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,aupr] # evaluation metrics, matching the scoring dict
    for metric in model_metrics_name: # compute each evaluation metric
        train_score = metric(y_train,y_train_pred) # on the training set
        test_score = metric(y_test,y_test_pred) # on the test set
        train_score_list.append(train_score) # collect the training-set metrics in one row
        test_score_list.append(test_score) # collect the test-set metrics in one row
    score_list.append(train_score_list) # combine the training and test results (for display)
    score_list.append(test_score_list)
    score_df = pd.DataFrame(score_list,index = ['train','test'],columns = ['accuracy','precision','recall','f1','roc_auc','aupr']) # show the results as a DataFrame with row/column labels
    print('EVALUATE_METRICS:')
    print(score_df)
    return cv_results,score_list,y_train_pred,y_test_pred

【Running a single model】
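
The original run log is not reproduced here; calling the function for a single classifier looks like this (the variable names on the left are only illustrative):

cv_results_dtc, score_list_dtc, y_train_pred_dtc, y_test_pred_dtc = grid_search_model(
    DecisionTreeClassifier(), parameter_dtc, 'Decision Tree',
    x_train, y_train, x_test, y_test)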

【Single-model results】

【Running multiple models in a loop】

train_score_list = []
test_score_list = []
y_train_pred_list = []
y_test_pred_list = []
for clf,param,name in zip(classifiers,parameters,names):
    cv_result,score_list,y_train_pred,y_test_pred = grid_search_model(clf,param,name,x_train,y_train,x_test,y_test) # run the main model function
    train_score_list.append(score_list[0])
    test_score_list.append(score_list[1])
    y_train_pred_list.append(y_train_pred)
    y_test_pred_list.append(y_test_pred)
    print('-------------------------------------------------------------------------------------------------------------------------------')
train_score_df = pd.DataFrame(train_score_list,index=names,columns=['acc','pre','rec','f1','roc_auc','aupr'])
test_score_df = pd.DataFrame(test_score_list,index=names,columns=['acc','pre','rec','f1','roc_auc','aupr'])
print('TRAIN_SCORE:')
print(train_score_df)
print()
print('TEST_SCORE:')
print(test_score_df)

【Multi-model results】

III. Plotting the ROC and PR curves

【Main function】

for clf_name,y_train_pred,y_test_pred in zip(names,y_train_pred_list,y_test_pred_list):
    show_curve(y_train,y_train_pred,clf_name,True)
    show_curve(y_test,y_test_pred,clf_name,False)

【Results】

IV. Helper functions (used inside the main program; they should be defined first, but are placed at the end of this post for readability)

1. The evaluation metrics include aupr (the area under the precision-recall curve): when the positive and negative classes are imbalanced, AUPR is a more informative evaluation metric than AUC.

def aupr(y_true,y_pred):
    precision, recall, thresholds = precision_recall_curve(y_true,y_pred)
    roc_aupr = auc(recall,precision) 
    return roc_aupr
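
If you also want aupr reported during the grid search itself (not only in the final evaluation above), you can wrap it with make_scorer and add it to the scoring dict; a small sketch (like the evaluation above, this computes aupr from hard predictions):

aupr_scorer = make_scorer(aupr)
scoring_with_aupr = dict(scoring, aupr=aupr_scorer)
# pass scoring=scoring_with_aupr to GridSearchCV; refit must still name a single key, e.g. refit='roc_auc'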

2. To use confusion-matrix entries as the scoring of a GridSearchCV model, they have to be wrapped with make_scorer.

def tn(y_true,y_pred): return confusion_matrix(y_true,y_pred)[0,0]
def fp(y_true,y_pred): return confusion_matrix(y_true,y_pred)[0,1]
def fn(y_true,y_pred): return confusion_matrix(y_true,y_pred)[1,0]
def tp(y_true,y_pred): return confusion_matrix(y_true,y_pred)[1,1]
make_score = {'tp':make_scorer(tp),'tn':make_scorer(tn),'fp':make_scorer(fp),'fn':make_scorer(fn)}
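
A sketch of how these scorers could be plugged into GridSearchCV (with a dict of scorers, refit must either be False or name one of the keys):

grid_cm = GridSearchCV(DecisionTreeClassifier(), parameter_dtc,
                       cv=5, scoring=make_score, refit=False, n_jobs=-1)
grid_cm.fit(x_train, y_train)
# refit=False: only cv_results_ is available; set refit to one key (e.g. refit='tp') to also get best_estimator_
print(pd.DataFrame(grid_cm.cv_results_)[['mean_test_tp','mean_test_tn','mean_test_fp','mean_test_fn']].head())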

3. Plotting, step 1: ROC and PR curve helpers

import matplotlib.pyplot as plt

def show_roc(roc_auc,fpr,tpr):
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line (random classifier)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

def show_roc_pr(roc_aupr,recall,precision):
    plt.figure(1)
    plt.plot(recall, precision, label='PR curve (area = %0.2f)' % roc_aupr)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve')
    plt.legend(loc='best')
    plt.show()
    print()

4. Plotting, step 2: compute the curves and call the helpers

def show_curve(y_true,y_pred,clf_name,train=True):
    fpr, tpr, thresholds1 = roc_curve(y_true,y_pred)
    precision, recall, thresholds2 = precision_recall_curve(y_true,y_pred)
    roc_auc = auc(fpr, tpr)
    roc_aupr = auc(recall,precision) 
    print('%s  (%s)' % (clf_name, 'train' if train else 'test'))
    show_roc(roc_auc,fpr,tpr)
    print()
    show_roc_pr(roc_aupr,recall,precision)
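
Note that y_train_pred and y_test_pred above are hard 0/1 class predictions, so the resulting ROC and PR curves only contain a few points. If the chosen estimator supports predict_proba, passing the positive-class probability to show_curve gives smoother and more informative curves; a sketch (the extra refit below is an assumption, since best_estimator_ is not stored globally in the code above):

best_rf = GridSearchCV(RandomForestClassifier(), parameter_rfc, cv=5,
                       scoring='roc_auc', n_jobs=-1).fit(x_train, y_train).best_estimator_
y_test_score = best_rf.predict_proba(x_test)[:, 1]   # probability of the positive class
show_curve(y_test, y_test_score, 'Random Forest (probabilities)', train=False)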