通過5折交叉驗證,實現邏輯迴歸,決策樹,SVM,隨機森林,GBDT,Xgboost,lightGBM的評分
阿新 • • 發佈:2018-12-22
通過5折交叉驗證,實現邏輯迴歸,決策樹,SVM,隨機森林,GBDT,Xgboost,lightGBM的評分
- 匯入的包
import pandas as pd import warnings from sklearn.preprocessing import scale from sklearn.model_selection import cross_val_score from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier from xgboost.sklearn import XGBClassifier import lightgbm as lgb
- 讀取資料集
data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')
- 處理資料集
df_y=data_all['status']
df_X=data_all.drop(columns=['status'])
df_X=scale(df_X,axis=0) #將資料轉化為標準資料
- 構建模型
lr = LogisticRegression(random_state=2018,tol=1e-6) # 邏輯迴歸模型 tree = DecisionTreeClassifier(random_state=2018) #決策樹模型 svm = SVC(probability=True,random_state=2018,tol=1e-6) # SVM模型 forest=RandomForestClassifier(n_estimators=100,random_state=2018) # 隨機森林 Gbdt=GradientBoostingClassifier(random_state=2018) #CBDT Xgbc=XGBClassifier(random_state=2018) #Xgbc gbm=lgb.LGBMClassifier(random_state=2018) #lgb
- 構建評分函式,並採取5折交叉驗證的方式評分
def muti_score(model): warnings.filterwarnings('ignore') accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5) precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5) recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5) f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5) auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5) print("準確率:",accuracy.mean()) print("精確率:",precision.mean()) print("召回率:",recall.mean()) print("F1_score:",f1_score.mean()) print("AUC:",auc.mean())
其中mean()指的是求得的均值
模型 | 準確率 | 精確率 | 召回率 | F1_score | AUC |
---|---|---|---|---|---|
邏輯迴歸 | 0.7890191148682617 | 0.6542724662896913 | 0.3377975457965613 | 0.44525012166067884 | 0.7840451024530857 |
決策樹 | 0.6962524533638791 | 0.39920670173446693 | 0.4157413593052284 | 0.40705496051057793 | 0.6029856787858856 |
SVM | 0.787758390223099 | 0.7351623295760905 | 0.24060335431243626 | 0.36179547264664874 | 0.7640376541388867 |
隨機森林 | 0.7921756804332226 | 0.7135700690071172 | 0.2867128441334693 | 0.40835414886475174 | 0.7752164698827589 |
GBDT | 0.7938590063951863 | 0.6604108594441386 | 0.36633732991104395 | 0.4708811551285791 | 0.7888240065764295 |
Xgboost | 0.7982740847293591 | 0.6829783239831001 | 0.3663162336064133 | 0.47673826685376613 | 0.7914190511145234 |
LightGbm | 0.79049080811139 | 0.6421783397519263 | 0.3730354066312717 | 0.47150438344663004 | 0.7776116341798183 |
-
分析
模型的評分思想,是通過採用5折交叉驗證,得出其中的均值分數來評判。從上表中可以看出邏輯迴歸,隨機森林,GBDT,Xgboost,LightGbm的各個指標都很相近而且分數也較高,說明這幾個模型擬合數據效果都較好,都可以選做模型。綜合來看Xgboost的分數更高一些,它的評分效果最好。 -
問題
01 還沒有學會在程式碼中,可以直接輸出表格的操作。
02 對各個模型的引數還不太瞭解
03 對資料集劃分,怎樣構造優質的資料還比較欠缺。 -
參考文章
cross_val_score的 scoring引數值解析
python機器學習庫sklearn——交叉驗證(K折、留一、留p、隨機)
12號同學寫的部落格 -
完整程式碼
import warnings

import lightgbm as lgb
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
# 讀取資料集
data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')
# 劃分為5折交叉驗證資料集
df_y=data_all['status']
df_X=data_all.drop(columns=['status'])
df_X=scale(df_X,axis=0) #將資料轉化為標準資料
#構建模型
lr = LogisticRegression(random_state=2018,tol=1e-6) # 邏輯迴歸模型
tree = DecisionTreeClassifier(random_state=2018) #決策樹模型
svm = SVC(probability=True,random_state=2018,tol=1e-6) # SVM模型
forest=RandomForestClassifier(n_estimators=100,random_state=2018) # 隨機森林
Gbdt=GradientBoostingClassifier(random_state=2018) #CBDT
Xgbc=XGBClassifier(random_state=2018) #Xgbc
gbm=lgb.LGBMClassifier(random_state=2018) #lgb
def muti_score(model):
warnings.filterwarnings('ignore')
accuracy = cross_val_score(model, df_X, df_y, scoring='accuracy', cv=5)
precision = cross_val_score(model, df_X, df_y, scoring='precision', cv=5)
recall = cross_val_score(model, df_X, df_y, scoring='recall', cv=5)
f1_score = cross_val_score(model, df_X, df_y, scoring='f1', cv=5)
auc = cross_val_score(model, df_X, df_y, scoring='roc_auc', cv=5)
print("準確率:",accuracy.mean())
print("精確率:",precision.mean())
print("召回率:",recall.mean())
print("F1_score:",f1_score.mean())
print("AUC:",auc.mean())
model_name=["lr","tree","svm","forest","Gbdt","Xgbc","gbm"]
for name in model_name:
model=eval(name)
print(name)
muti_score(model)
'''
lr
準確率: 0.7890191148682617
精確率: 0.6542724662896913
召回率: 0.3377975457965613
F1_score: 0.44525012166067884
AUC: 0.7840451024530857
tree
準確率: 0.6962524533638791
精確率: 0.39920670173446693
召回率: 0.4157413593052284
F1_score: 0.40705496051057793
AUC: 0.6029856787858856
svm
準確率: 0.787758390223099
精確率: 0.7351623295760905
召回率: 0.24060335431243626
F1_score: 0.36179547264664874
AUC: 0.7640376541388867
forest
準確率: 0.7921756804332226
精確率: 0.7135700690071172
召回率: 0.2867128441334693
F1_score: 0.40835414886475174
AUC: 0.7752164698827589
Gbdt
準確率: 0.7938590063951863
精確率: 0.6604108594441386
召回率: 0.36633732991104395
F1_score: 0.4708811551285791
AUC: 0.7888240065764295
Xgbc
準確率: 0.7982740847293591
精確率: 0.6829783239831001
召回率: 0.3663162336064133
F1_score: 0.47673826685376613
AUC: 0.7914190511145234
gbm
準確率: 0.79049080811139
精確率: 0.6421783397519263
召回率: 0.3730354066312717
F1_score: 0.47150438344663004
AUC: 0.7776116341798183
'''