Scoring Logistic Regression, Decision Tree, SVM, Random Forest, GBDT, XGBoost, and LightGBM with 5-Fold Cross-Validation

  • Imports
import pandas as pd
import warnings
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
  • Read the dataset
data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')
  • Preprocess the data
df_y = data_all['status']
df_X = data_all.drop(columns=['status'])
df_X = scale(df_X, axis=0)  # standardize each feature to zero mean, unit variance
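Note that scale here is fit on the full dataset before cross-validation, so the scaling statistics see the held-out rows of every fold. A minimal leak-free sketch, assuming the same data_all as above (make_pipeline and StandardScaler are the only new imports; LogisticRegression and cross_val_score come from the imports section):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The scaler is refit on the training portion of each fold, so no
# statistics from the validation fold leak into training.
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=2018, tol=1e-6))
print(cross_val_score(pipe, data_all.drop(columns=['status']), df_y,
                      scoring='roc_auc', cv=5).mean())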
  • Build the models
lr = LogisticRegression(random_state=2018, tol=1e-6)  # logistic regression

tree = DecisionTreeClassifier(random_state=2018)  # decision tree

svm = SVC(probability=True, random_state=2018, tol=1e-6)  # SVM

forest = RandomForestClassifier(n_estimators=100, random_state=2018)  # random forest

Gbdt = GradientBoostingClassifier(random_state=2018)  # GBDT

Xgbc = XGBClassifier(random_state=2018)  # XGBoost

gbm = lgb.LGBMClassifier(random_state=2018)  # LightGBM
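A side note on SVC: probability=True runs an extra internal cross-validation (Platt scaling) just to expose predict_proba, which is slow. To my understanding, sklearn's 'roc_auc' scorer can fall back to decision_function, and the other four scorers only need predict, so a plain SVC should score the same metrics faster; a sketch worth verifying on your sklearn version:

# Hypothetical variant: skip Platt scaling; 'roc_auc' uses decision_function.
svm_fast = SVC(random_state=2018, tol=1e-6)
print(cross_val_score(svm_fast, df_X, df_y, scoring='roc_auc', cv=5).mean())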
  • Define the scoring function and score each model with 5-fold cross-validation
def muti_score(model):
    warnings.filterwarnings('ignore')
    accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5)
    precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5)
    recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5)
    f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5)
    auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5)
    print("準確率:",accuracy.mean())
    print("精確率:",precision.mean())
    print("召回率:",recall.mean())
    print("F1_score:",f1_score.mean())
    print("AUC:",auc.mean())

Here mean() averages each metric over the five folds.
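One caveat: calling cross_val_score once per metric refits each model five times per call, i.e. 25 fits per model. A single-pass sketch, assuming the same df_X and df_y, uses sklearn's cross_validate with a list of scorers (muti_score_fast is a hypothetical name):

from sklearn.model_selection import cross_validate

def muti_score_fast(model):
    # One 5-fold run computes all five metrics; cross_validate returns
    # a dict with one 'test_<scorer>' array per metric.
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    scores = cross_validate(model, df_X, df_y, cv=5, scoring=metrics)
    for m in metrics:
        print(m + ':', scores['test_' + m].mean())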

Mean scores over the five folds, rounded to four decimals (full-precision values appear in the output at the end):

Model                 Accuracy  Precision  Recall  F1_score  AUC
Logistic regression   0.7890    0.6543     0.3378  0.4453    0.7840
Decision tree         0.6963    0.3992     0.4157  0.4071    0.6030
SVM                   0.7878    0.7352     0.2406  0.3618    0.7640
Random forest         0.7922    0.7136     0.2867  0.4084    0.7752
GBDT                  0.7939    0.6604     0.3663  0.4709    0.7888
XGBoost               0.7983    0.6830     0.3663  0.4767    0.7914
LightGBM              0.7905    0.6422     0.3730  0.4715    0.7776
  • Analysis
    Each model is scored by its metric means over 5-fold cross-validation. The table shows that logistic regression, random forest, GBDT, XGBoost, and LightGBM are close on every metric and score fairly high, so all of them fit the data reasonably well and any of them is a defensible choice. XGBoost scores highest overall, so it performs best here.

  • Problems
    01 Haven't yet learned how to print the results as a table directly from code (see the sketch after this list).
    02 Not yet familiar with each model's hyperparameters.
    03 Still lack experience with splitting datasets and constructing high-quality features.
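For problem 01, a minimal sketch, assuming the models defined above and the cross_validate approach from the earlier sketch: collect each model's mean scores into a pandas DataFrame, which prints as a table.

from sklearn.model_selection import cross_validate

metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
rows = {}
for name, model in [('lr', lr), ('tree', tree), ('svm', svm), ('forest', forest),
                    ('Gbdt', Gbdt), ('Xgbc', Xgbc), ('gbm', gbm)]:
    scores = cross_validate(model, df_X, df_y, cv=5, scoring=metrics)
    rows[name] = [scores['test_' + m].mean() for m in metrics]

# One row per model, one column per metric; print renders it as a table.
print(pd.DataFrame.from_dict(rows, orient='index', columns=metrics).round(4))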


  • Complete code

import pandas as pd
import warnings
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb


# Read the dataset
data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')

# Separate the label from the features (the 5-fold split itself happens inside cross_val_score)
df_y = data_all['status']
df_X = data_all.drop(columns=['status'])
df_X = scale(df_X, axis=0)  # standardize each feature to zero mean, unit variance

# Build the models

lr = LogisticRegression(random_state=2018, tol=1e-6)  # logistic regression

tree = DecisionTreeClassifier(random_state=2018)  # decision tree

svm = SVC(probability=True, random_state=2018, tol=1e-6)  # SVM

forest = RandomForestClassifier(n_estimators=100, random_state=2018)  # random forest

Gbdt = GradientBoostingClassifier(random_state=2018)  # GBDT

Xgbc = XGBClassifier(random_state=2018)  # XGBoost

gbm = lgb.LGBMClassifier(random_state=2018)  # LightGBM



def muti_score(model):
    warnings.filterwarnings('ignore')
    accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5)
    precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5)
    recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5)
    f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5)
    auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5)
    print("準確率:",accuracy.mean())
    print("精確率:",precision.mean())
    print("召回率:",recall.mean())
    print("F1_score:",f1_score.mean())
    print("AUC:",auc.mean())



# Score every model; pairing names with objects avoids eval()
models = [('lr', lr), ('tree', tree), ('svm', svm), ('forest', forest),
          ('Gbdt', Gbdt), ('Xgbc', Xgbc), ('gbm', gbm)]
for name, model in models:
    print(name)
    muti_score(model)


'''
lr
Accuracy: 0.7890191148682617
Precision: 0.6542724662896913
Recall: 0.3377975457965613
F1_score: 0.44525012166067884
AUC: 0.7840451024530857
tree
Accuracy: 0.6962524533638791
Precision: 0.39920670173446693
Recall: 0.4157413593052284
F1_score: 0.40705496051057793
AUC: 0.6029856787858856
svm
Accuracy: 0.787758390223099
Precision: 0.7351623295760905
Recall: 0.24060335431243626
F1_score: 0.36179547264664874
AUC: 0.7640376541388867
forest
Accuracy: 0.7921756804332226
Precision: 0.7135700690071172
Recall: 0.2867128441334693
F1_score: 0.40835414886475174
AUC: 0.7752164698827589
Gbdt
Accuracy: 0.7938590063951863
Precision: 0.6604108594441386
Recall: 0.36633732991104395
F1_score: 0.4708811551285791
AUC: 0.7888240065764295
Xgbc
Accuracy: 0.7982740847293591
Precision: 0.6829783239831001
Recall: 0.3663162336064133
F1_score: 0.47673826685376613
AUC: 0.7914190511145234
gbm
Accuracy: 0.79049080811139
Precision: 0.6421783397519263
Recall: 0.3730354066312717
F1_score: 0.47150438344663004
AUC: 0.7776116341798183
'''