scikit-learn中gridSearchCV的使用
阿新 • • 發佈:2018-12-20
步驟:
1.選擇並構建訓練模型model
2.將訓練模型model投入到GridSearchCV中,得到GridSearchCV模型grid_model
3.用grid_model擬合訓練集資料,選擇在validation_dataset上效果最好的引數的模型best_estimator
4.1.用best_estimator擬合訓練集(得到的結果應該與之前不同,因為之前用交叉驗證等方法對訓練集進行了分割)
4.2.用best_estimator擬合測試集
5.結果視覺化:AUC曲線,AUPR曲線
一.資料
【資料準備】
| 資料集 | X 的 Size | y 的 Size |
訓練集 | (1206, 294) | (1206,) |
測試集 | (64, 294) | (64,) |
二.主模型
【搭建環境】
from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.model_selection import GridSearchCV,cross_val_score from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_recall_curve from sklearn.metrics import confusion_matrix,make_scorer
【模型準備】
# --- Experiment setup -------------------------------------------------------
# Fix the RNG seed so grid-search results are reproducible.
seed = 1231
np.random.seed(seed)

# x_train / y_train / x_test / y_test are assumed to be prepared earlier
# (per the blog: train (1206, 294) / (1206,), test (64, 294) / (64,)).
# (The original no-op self-assignment `x_train,... = x_train,...` is removed.)

names = ['Decision Tree', 'Random Forest']
classifiers = [DecisionTreeClassifier(), RandomForestClassifier()]

# One hyper-parameter grid per classifier, in the same order as `classifiers`.
parameter_dtc = {'max_features': ['auto', 'sqrt', 'log2', None],
                 'max_depth': range(3, 100, 2)}
parameter_rfc = {'n_estimators': range(5, 200, 5),
                 'max_features': ['auto', 'sqrt', 'log2', None],
                 'max_depth': range(3, 100, 2)}
# BUG FIX: the original list also contained parameter_ada_dtc / parameter_mlp,
# which are never defined (NameError at runtime) and have no matching
# classifier in `classifiers`.
parameters = [parameter_dtc, parameter_rfc]

# Multi-metric scoring dict passed to GridSearchCV; refit uses 'roc_auc'.
scoring = {'roc_auc': 'roc_auc', 'accuracy': 'accuracy',
           'precision': 'precision', 'recall': 'recall', 'f1': 'f1'}
【主模型函式】
def gird_search_model(clf, param, name, x_train, y_train, x_test, y_test):
    """Grid-search `clf` over `param`, then evaluate the best estimator.

    NOTE: the name keeps the original 'gird' typo because callers use it.

    Parameters
    ----------
    clf   : an unfitted sklearn classifier.
    param : hyper-parameter grid for GridSearchCV.
    name  : display name of the classifier (for printed reports).
    x_train, y_train, x_test, y_test : the data splits.

    Returns
    -------
    (cv_results, score_list, y_train_pred, y_test_pred) where score_list is
    [train_metric_values, test_metric_values] in the order
    accuracy/precision/recall/f1/roc_auc/aupr.
    """
    # 5-fold CV over the grid; multi-metric scoring (module-level `scoring`),
    # refit the winner on the full training set by roc_auc.
    searcher = GridSearchCV(clf, param, cv=5, verbose=2, scoring=scoring,
                            refit='roc_auc', n_jobs=-1,
                            return_train_score=True)
    fit = searcher.fit(x_train, y_train)

    # Predictions of the refit best estimator on both splits.
    best = fit.best_estimator_
    y_train_pred = best.predict(x_train)
    y_test_pred = best.predict(x_test)

    # Cross-validation results as a DataFrame, keyed by parameter combination.
    cv_results = pd.DataFrame(fit.cv_results_).set_index(['params'])
    mean_cols = ['mean_train_accuracy', 'mean_train_f1',
                 'mean_train_precision', 'mean_train_recall',
                 'mean_train_roc_auc',
                 'mean_test_accuracy', 'mean_test_f1',
                 'mean_test_precision', 'mean_test_recall',
                 'mean_test_roc_auc']
    std_cols = ['std_train_accuracy', 'std_train_f1', 'std_train_precision',
                'std_train_recall', 'std_train_roc_auc',
                'std_test_accuracy', 'std_test_f1', 'std_test_precision',
                'std_test_recall', 'std_test_roc_auc']
    cv_results_mean = cv_results[mean_cols]
    cv_results_std = cv_results[std_cols]

    # ------------------- report -------------------
    print('MODEL : %r' % name)
    print('Best cv_test_roc_auc: %f using %s'
          % (fit.best_score_, fit.best_params_))
    print(cv_results_mean)
    print(cv_results_std)

    # Evaluation metrics (same order as the columns below); `aupr` is the
    # module-level PR-curve-area helper.
    metrics = [accuracy_score, precision_score, recall_score, f1_score,
               roc_auc_score, aupr]
    train_score_list = [metric(y_train, y_train_pred) for metric in metrics]
    test_score_list = [metric(y_test, y_test_pred) for metric in metrics]
    score_list = [train_score_list, test_score_list]

    score_df = pd.DataFrame(score_list, index=['train', 'test'],
                            columns=['accuracy', 'precision', 'recall',
                                     'f1', 'roc_auc', 'aupr'])
    print('EVALUATE_METRICS:')
    print(score_df)
    return cv_results, score_list, y_train_pred, y_test_pred
【單個模型執行過程】
【單個模型執行結果】
【多個模型迴圈執行】
# Run the grid-search driver for every (classifier, grid, name) triple and
# collect per-model scores and predictions.
train_score_list, test_score_list = [], []
y_train_pred_list, y_test_pred_list = [], []
for estimator, grid, model_name in zip(classifiers, parameters, names):
    cv_result, score_list, tr_pred, te_pred = gird_search_model(
        estimator, grid, model_name, x_train, y_train, x_test, y_test)
    train_score_list.append(score_list[0])   # train-split metric row
    test_score_list.append(score_list[1])    # test-split metric row
    y_train_pred_list.append(tr_pred)
    y_test_pred_list.append(te_pred)
    print('-------------------------------------------------------------------------------------------------------------------------------')

# Summarize all models: one row per model, one column per metric.
metric_cols = ['acc', 'pre', 'rec', 'f1', 'roc_auc', 'aupr']
train_score_df = pd.DataFrame(train_score_list, index=names,
                              columns=metric_cols)
test_score_df = pd.DataFrame(test_score_list, index=names,
                             columns=metric_cols)
print('TRAIN_SCORE:')
print(train_score_df)
print()
print('TEST_SCORE:')
print(test_score_df)
【多個模型執行結果】
三.畫AUC和PRC圖
【主函式】
# Draw ROC / PR curves for every fitted model: first on the training split
# (train=True), then on the test split (train=False).
for model_name, train_pred, test_pred in zip(names, y_train_pred_list,
                                             y_test_pred_list):
    show_curve(y_train, train_pred, model_name, True)
    show_curve(y_test, test_pred, model_name, False)
【結果】
四.子函式(主程式內的,應該寫在最前面,本文為便於理解,放在最後)
1.模型評估函式裡有一個aupr(precision-recall-curve的曲線下面積):當正負樣本不平衡時使用aupr評估比auc好。
def aupr(y_true, y_pred):
    """Return the area under the precision-recall curve (AUPR).

    Preferred over ROC-AUC when positive/negative classes are imbalanced.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)
2.如果想使用混淆矩陣作為GridSearchCV模型中的scoring,需要用make_scorer轉換一下。
# Confusion-matrix cell extractors. To use confusion-matrix cells as a
# GridSearchCV `scoring` entry, each cell needs its own callable wrapped
# with make_scorer.
def tn(y_true, y_pred):
    # True negatives: actual negative (row 0), predicted negative (col 0).
    return confusion_matrix(y_true, y_pred)[0, 0]


def fp(y_true, y_pred):
    # False positives: actual negative, predicted positive.
    return confusion_matrix(y_true, y_pred)[0, 1]


def fn(y_true, y_pred):
    # False negatives: actual positive, predicted negative.
    return confusion_matrix(y_true, y_pred)[1, 0]


def tp(y_true, y_pred):
    # True positives: actual positive, predicted positive.
    return confusion_matrix(y_true, y_pred)[1, 1]


make_score = {'tp': make_scorer(tp), 'tn': make_scorer(tn),
              'fp': make_scorer(fp), 'fn': make_scorer(fn)}
3.畫圖_步1:AUC和PRC曲線
import matplotlib.pyplot as plt
def show_roc(roc_auc, fpr, tpr):
    """Plot a ROC curve with its AUC in the legend.

    Parameters
    ----------
    roc_auc : float, area under the ROC curve (shown in the legend).
    fpr, tpr : arrays from sklearn.metrics.roc_curve.
    """
    plt.figure(1)
    # Chance-level diagonal, drawn as an unlabeled dashed reference line.
    plt.plot([0, 1], [0, 1], 'k--')
    # BUG FIX: the legend label belongs on the ROC curve itself; the original
    # attached it to the diagonal, leaving the actual curve unlabeled.
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
def show_roc_pr(roc_aupr, recall, precision):
    """Plot a precision-recall curve with its area (AUPR) in the legend.

    Parameters
    ----------
    roc_aupr : float, area under the PR curve (shown in the legend).
    recall, precision : arrays from sklearn.metrics.precision_recall_curve.
    """
    plt.figure(1)
    # NOTE(review): the y=x diagonal is not a meaningful PR baseline (that
    # would be the positive-class prevalence); kept only as a visual guide.
    plt.plot([0, 1], [0, 1], 'k--')
    # BUG FIX: the original attached the label to the diagonal and called it
    # 'ROC curve'; the label now sits on the PR curve with the correct name.
    plt.plot(recall, precision, label='PR curve (area = %0.2f)' % roc_aupr)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('ROC_PR curve')
    plt.legend(loc='best')
    plt.show()
print()  # blank line separating consecutive plot printouts
4.畫圖_步2:AUC和PRC曲線
def show_curve(y_true, y_pred, clf_name, train=True):
    """Compute and plot the ROC and PR curves for one classifier's output.

    Parameters
    ----------
    y_true   : ground-truth binary labels.
    y_pred   : predicted values for the positive class.
    clf_name : classifier display name, printed as a header.
    train    : True when plotting the training split, False for test.

    NOTE(review): callers pass hard 0/1 predictions here, which collapses
    the ROC/PR curves to a few points; `predict_proba` scores would give
    far more informative curves — confirm with the caller.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    roc_aupr = auc(recall, precision)
    # Idiom fix: `train` is already a boolean — no `== True` comparison;
    # the two print branches collapse into one conditional expression.
    print('%s (%s)' % (clf_name, 'train' if train else 'test'))
    show_roc(roc_auc, fpr, tpr)
    print()
    show_roc_pr(roc_aupr, recall, precision)