1. 程式人生 > >機器學習一個小目標——Task7

機器學習一個小目標——Task7

1. 任務

【任務六-模型融合】用你目前評分最高的模型作為基準模型,和其他模型進行stacking融合,得到最終模型及評分

2. Stacking融合

按照自己的理解
第一層:
使用交叉驗證的劃分方法,將訓練集劃分成5份,
使用第一個基分類器對劃分之後得到的驗證集進行預測,得到5個predict檔案,維數 $(\frac{n}{5}, 1)$,縱向拼接得到1個Predict檔案,維數 $(n, 1)$

使用第一個基分類器對整個Test進行預測,得到5個預測檔案p_t,維數 $(r, 1)$,橫向拼接,求平均值得到1個Pt檔案,維數 $(r, 1)$
使用第二個基分類器,
。。。
得到5個Predict檔案,維數 $(n, 1)$;5個Pt檔案,維數 $(r, 1)$

在這裡插入圖片描述
第二層
將第一層得到的5個Predict檔案(維數 $(n, 1)$)橫向拼接,再和訓練集拼接,得到新的訓練集Train,維數 $(n, m+5)$
將第一層得到的5個Pt檔案(維數 $(r, 1)$)橫向拼接,再和測試集拼接,得到新的測試集Test,維數 $(r, m+5)$
對第二層的訓練集進行訓練,得到新的模型,
在這裡插入圖片描述
第三層:
使用新的模型對測試集進行預測
在這裡插入圖片描述

3. 實現程式碼

3.1 以下是按自己的理解寫的程式碼:

#!/usr/bin/env python 3.6
# -*- coding:utf-8 -*-
# @File    : CV1.py
# @Date    : 2018-11-22
# @Author  : 黑桃
# @Software: PyCharm 
from pandas import Series, DataFrame
import pickle

import pandas as pd
from sklearn.externals import joblib
from pandas import Series, DataFrame
from sklearn import svm
from sklearn.model_selection import *  # 劃分資料 交叉驗證
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, roc_curve
import warnings

warnings.filterwarnings("ignore")
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取資料
"""
print("0 讀取特徵")
# Use a context manager so the pickle file is closed even if unpickling fails
# (the original open/close pair leaked the handle on error).
with open(path + 'feature/feature_V3.pkl', 'rb') as f:
    # train/test: feature matrices; y_train/y_test: corresponding labels.
    train, test, y_train, y_test = pickle.load(f)

"""=====================================================================================================================
2 進行K次訓練;用K個模型分別對測試集進行預測,並得到K個結果,再進行結果的融合
"""


"""=====================================================================================================================
3  交叉驗證方式
"""
# Candidate cross-validation splitters; the one actually used is picked via ``cv`` below.
# NOTE: random_state only takes effect together with shuffle=True, and recent
# scikit-learn versions raise a ValueError for KFold(random_state=...) while
# shuffle is left False — so shuffling is enabled explicitly here.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
loo = LeaveOneOut()  # each test set holds exactly one sample, train set the other n-1
lpo = LeavePOut(p=2000)  # each test set holds p samples, train set the other n-p
ss = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
tss = TimeSeriesSplit(n_splits=5)

# Group-aware splitters (require a ``groups`` argument when used).
logo = LeaveOneGroupOut()
lpgo = LeavePGroupsOut(n_groups=3)
gss = GroupShuffleSplit(n_splits=4, test_size=.5, random_state=0)
gkf = GroupKFold(n_splits=2)
"""【配置交叉驗證方式】"""
cv = kf

"""=====================================================================================================================
2 讀取模型
"""
print("1 讀取模型")
# All first-level models were trained and serialized by earlier scripts;
# they share one directory, hoisted here to avoid repeating the prefix.
model_dir = path + "model/model_file/"
SVM_linear = joblib.load(model_dir + "SVM_linear.pkl")
SVM_poly = joblib.load(model_dir + "SVM_poly.pkl")
SVM_rbf = joblib.load(model_dir + "SVM_rbf.pkl")
SVM_sigmoid = joblib.load(model_dir + "SVM_sigmoid.pkl")
lg_120 = joblib.load(model_dir + "lg_120.pkl")
DT = joblib.load(model_dir + "DT.pkl")
xgb_sklearn = joblib.load(model_dir + "xgb_sklearn.pkl")
lgb_sklearn = joblib.load(model_dir + "lgb_sklearn.pkl")
xgb = joblib.load(model_dir + "xgb.pkl")
lgb = joblib.load(model_dir + "lgb.pkl")

# The pickled labels keep their original (non-zero-based) index; reset it so
# positional fold indices line up with the label rows.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
"""=====================================================================================================================
3 【第一層】用預測結果構建特徵
"""
def get_feature(clf, train, y_train, test, y_test, cv):
    """First stacking layer: build out-of-fold features for one base model.

    For every CV fold the classifier is refit on the fold's training part and
    then predicts (a) the held-out part and (b) the full test set.

    Parameters
    ----------
    clf : estimator with fit/predict
        Base classifier; refit once per fold.
    train : ndarray, shape (n, m)
        Training features, indexable by integer row positions.
    y_train : Series
        Training labels with a zero-based index (reset upstream).
    test : array-like, shape (r, m)
        Test features, predicted by every fold's model.
    y_test : unused
        Kept only so the call signature matches the sibling scripts.
    cv : cross-validation splitter
        Anything exposing ``split(X, y)`` yielding (train_idx, vali_idx).

    Returns
    -------
    Test_i : Series, shape (r,)
        Mean of the per-fold test-set predictions.
    Train_i : Series, shape (n,)
        Out-of-fold predictions, aligned with the rows of ``train``.
    """
    oof_preds = []   # one Series of held-out predictions per fold
    test_preds = []  # one Series of full-test predictions per fold
    for train_idx, vali_idx in cv.split(train, y_train):
        # Materialize the fold's train/validation slices.
        f_train_x = DataFrame(train[train_idx])
        f_train_y = DataFrame(y_train[train_idx])
        f_vali_x = DataFrame(train[vali_idx])

        clf.fit(f_train_x, f_train_y)

        # Index the held-out predictions by their original row positions so the
        # final concat restores training-set order even when the splitter
        # shuffles or produces unequal fold sizes (the original transpose trick
        # silently padded unequal folds with NaN).
        oof_preds.append(Series(clf.predict(f_vali_x), index=vali_idx))
        test_preds.append(Series(clf.predict(test)))

    Train_i = pd.concat(oof_preds).sort_index()
    # Bug fix: the original computed Test_i as the fold sum divided by a
    # hard-coded 3, although five folds contribute; take the true mean instead.
    Test_i = pd.concat(test_preds, axis=1).mean(axis=1)
    return Test_i, Train_i

"""=====================================================================================================================
4 【第二層】特徵組合
"""
# Build the second-layer data: one out-of-fold prediction column and one
# averaged test-prediction column per first-level model.
first_layer = [SVM_linear, lg_120, DT, SVM_rbf, lgb_sklearn]
train_cols = []
test_cols = []
for base_model in first_layer:
    test_col, train_col = get_feature(base_model, train, y_train, test, y_test, cv)
    train_cols.append(train_col)
    test_cols.append(test_col)

Train = pd.concat(train_cols, axis=1).reset_index(drop=True)
Test = pd.concat(test_cols, axis=1).astype(int).reset_index(drop=True)

train = DataFrame(train).reset_index(drop=True)
test = DataFrame(test).reset_index(drop=True)

# Append the stacked prediction columns to the original features.
Train = pd.concat([Train, train], axis=1)
Test = pd.concat([Test, test], axis=1)

"""=====================================================================================================================
【LGB_sklearn介面訓練】
"""
import lightgbm as lgbm
# Second-layer (meta) learner: an LGBM classifier fit on the stacked
# out-of-fold predictions concatenated with the original features.
# NOTE(review): this rebinds the name ``lgb_sklearn`` that previously held
# the loaded first-level model — presumably intentional, but worth renaming.
lgb_sklearn = lgbm.LGBMClassifier(learning_rate=0.1,
    max_bin=150,
    num_leaves=32,
    max_depth=11,
    reg_alpha=0.1,
    reg_lambda=0.2,
    # objective='multiclass',
    n_estimators=300,)
lgb_sklearn.fit(Train,y_train)
# y_lgb_pre = lgb_sklearn.predict(Test)
y_lgb_pre = lgb_sklearn.predict(Test)

# Report accuracy on the stacked train/test sets.
print( "lgb_sklearn_Train_Score :{}".format(lgb_sklearn.score(Train, y_train)))
print("lgb_sklearn_Test_Score :{}".format(lgb_sklearn.score(Test, y_test)))
# print("lgb_sklearn_Train_AUC Score :{:.4f}".format(roc_auc_score(y_train, y_lgb_pre)))
# NOTE(review): AUC is computed from hard class labels here; feeding
# predict_proba(Test)[:, 1] would give the conventional ranking AUC — confirm intent.
print("lgb_sklearn_Test_AUC Score :{}".format(roc_auc_score(y_test, y_lgb_pre)))

3.2 調包實現的程式碼:

#!/usr/bin/env python 3.6
#-*- coding:utf-8 -*-
# @File    : Stacking2.py
# @Date    : 2018-11-25
# @Author  : 黑桃
# @Software: PyCharm 
from sklearn import datasets
import warnings
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np
from sklearn.externals import joblib
warnings.filterwarnings("ignore")
# Removed dead code: ``iris = datasets.load_iris()`` was never referenced.
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取特徵
"""
print("0 讀取特徵")
# Use a context manager so the pickle file is closed even if unpickling fails.
with open(path + 'feature/feature_V3.pkl', 'rb') as f:
    train, test, y_train, y_test = pickle.load(f)
# Aliases used by cross_val_score below.
X, y = train, y_train

"""=====================================================================================================================
2 讀取模型
"""
print("1 讀取模型")
# Pre-trained first-level models serialized by the earlier training scripts;
# the shared directory prefix is hoisted to avoid repetition.
model_dir = path + "model/model_file/"
SVM_linear = joblib.load(model_dir + "SVM_linear.pkl")
SVM_poly = joblib.load(model_dir + "SVM_poly.pkl")
SVM_rbf = joblib.load(model_dir + "SVM_rbf.pkl")
SVM_sigmoid = joblib.load(model_dir + "SVM_sigmoid.pkl")
lg_120 = joblib.load(model_dir + "lg_120.pkl")
DT = joblib.load(model_dir + "DT.pkl")
xgb_sklearn = joblib.load(model_dir + "xgb_sklearn.pkl")
lgb_sklearn = joblib.load(model_dir + "lgb_sklearn.pkl")
xgb = joblib.load(model_dir + "xgb.pkl")
lgb = joblib.load(model_dir + "lgb.pkl")

# First-level (base) classifiers for the stacking ensemble.
clf1 = SVM_linear
clf2 = lg_120
clf3 = DT
clf4 = SVM_rbf
clf5 = lgb_sklearn
# Second-level (meta) learner.
lr = LogisticRegression()
# Bug fix: ``meta_classifier`` was the very same ``lgb_sklearn`` object also
# used as base learner clf5, which is why the StackingClassifier's CV score
# always reproduced lgb_sklearn's score (the question raised in section 5).
# Use the logistic regression that was created for exactly this purpose but
# previously left unused.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5],
                          meta_classifier=lr)

print('5-fold cross validation:\n')

# Compare each base model and the stacked ensemble with 5-fold CV accuracy.
candidates = [clf1, clf2, clf3, clf4, clf5, sclf]
candidate_names = ['SVM_linear', 'lg_120', 'DT', 'SVM_rbf', 'lgb_sklearn', 'StackingClassifier']
for model, model_name in zip(candidates, candidate_names):
    cv_scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %s (+/- %0.9f) [%s]"
          % (cv_scores.mean(), cv_scores.std(), model_name))


4. 實驗結果

自己的程式碼結果:

一級分類器 二級lgb Stacking之前 Stacking之後
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn LGB_sklearn介面(predict) AUC Score 0.7951391197086869 0.78980256597753
SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn LGB_sklearn介面(proba) AUC Score 0.6481179876945349 0.6372122138757783

調包實現的結果:

Accuracy: 0.7845685143591137 (+/- 0.008116008) [SVM_linear]
Accuracy: 0.7946686730541058 (+/- 0.008620505) [lg_120]
Accuracy: 0.7671842760458581 (+/- 0.017846894) [DT]
Accuracy: 0.7514728483069482 (+/- 0.000409207) [SVM_rbf]
Accuracy: 0.7831687376559587 (+/- 0.012327716) [lgb_sklearn]
Accuracy: 0.7831687376559587 (+/- 0.012327716) [StackingClassifier]

5. 遇到的問題

  • 調包實現時出現問題:五個一級分類器分別是
    SVM_linear、DT、SVM_rbf、lg_120、LGB_sklearn,二級分類器為 LGB_sklearn,但是最終的StackingClassifier分數始終是LGB_sklearn的分數?

6. 參考資料

推薦|Kaggle機器學習之模型融合(stacking)心得
StackingClassifier