【SciKit-Learn學習筆記】4：決策樹擬合泰坦尼克號資料集並提交到Kaggle

阿新 • • 發佈：2018-12-26

學習《scikit-learn機器學習》時的一些實踐。

決策樹擬合泰坦尼克號資料集

這裡用繪製引數-score曲線的方式去直觀看出模型引數對模型得分的影響，作者使用了GridSearchCV來自動做k-fold交叉驗證，並且能在多組模型引數中找到最優的一組和最優值（用平均score來評估）。

這種方式可以避免只做一次隨機劃分時不確定性太大、得到的曲線很不穩定的問題。

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Make matplotlib render the minus sign correctly.
plt.rcParams['axes.unicode_minus'] = False

# Base directory of the data files; the script appends "z7/train.csv" to it.
BASE_DIR = "E:/WorkSpace/ReadingNotes/scikit-learn機器學習/data/"

'''
泰坦尼克號資料集: https://www.kaggle.com/c/titanic/data
'''


# 讀取泰坦尼克號資料,並做一定的預處理 

def read_dataset(file, embarked_values=('S', 'C', 'Q')):
    """Load a Titanic CSV and return it as an all-numeric DataFrame.

    Parameters:
        file: path or open file object, passed straight to ``pandas.read_csv``.
            The first column (PassengerId) becomes the row index.
        embarked_values: port labels in encoding order; the label at position
            ``i`` is encoded as ``i``. Missing values and labels that are not
            listed are encoded as ``len(embarked_values)``.

    Returns:
        The DataFrame without the Name/Ticket/Cabin columns, with Sex encoded
        as male=1 / female=0, Embarked encoded as integers, and every
        remaining NaN replaced by 0.
    """
    df = pd.read_csv(file, index_col=0)
    # Free-text columns that are not used as features.
    df = df.drop(["Name", "Ticket", "Cabin"], axis=1)
    # Boolean mask -> 1 for male, 0 for female.
    df['Sex'] = (df['Sex'] == 'male').astype('int')
    # Encode the port with a fixed table. Numbering the labels by their order
    # of first appearance would give the same port different codes in two
    # files that list the ports in a different order (e.g. train vs. test).
    ports = tuple(embarked_values)
    port_codes = {port: code for code, port in enumerate(ports)}
    # Series.map leaves unmapped entries (including NaN) as NaN; they all get
    # the single extra code that follows the listed ports.
    df['Embarked'] = df['Embarked'].map(port_codes).fillna(len(ports)).astype('int')
    # Any other missing value (NaN) becomes 0.
    return df.fillna(0)


# 計算決策樹模型在指定引數下訓練集和驗證集上的得分
def get_score(X, y, *, X_val=None, y_val=None, **kwargs):
    """Fit a decision tree on (X, y) and score it on training and validation data.

    Parameters:
        X, y: training features and labels. The tree is fitted on them and the
            training score is computed on them.
        X_val, y_val: optional held-out features and labels. When left as None
            the module-level ``X_cv`` / ``y_cv`` are used.
        **kwargs: forwarded unchanged to ``DecisionTreeClassifier``.

    Returns:
        Tuple ``(train_score, cv_score)``.
    """
    # Fall back to the hold-out split created by the script's main section.
    if X_val is None:
        X_val = X_cv
    if y_val is None:
        y_val = y_cv
    clf = DecisionTreeClassifier(**kwargs)
    clf.fit(X, y)
    # Score on the data that was actually fitted, not on the global X_train /
    # y_train, so the result is correct for any (X, y) the caller passes.
    train_score = clf.score(X, y)
    cv_score = clf.score(X_val, y_val)
    return (train_score, cv_score)


# 尋找模型中引數的較優值,這裡尋找max_depth(決策樹前剪枝:最大深度)
def find_max_depth():
    """Scan ``max_depth`` (pre-pruning by tree depth) and plot the scores.

    Reads the module-level ``X_train`` / ``y_train``, evaluates every depth in
    ``range(2, 15)`` with ``get_score``, prints the depth with the highest
    validation score and shows the train/validation curves.
    """
    candidate_depths = range(2, 15)
    train_scores = []
    cv_scores = []
    for depth in candidate_depths:
        train_score, cv_score = get_score(X_train, y_train, max_depth=depth)
        train_scores.append(train_score)
        cv_scores.append(cv_score)
    # Position of the best validation score; it indexes candidate_depths too.
    best = np.argmax(cv_scores)
    print("最好的引數值:{},對應的驗證集評分:{}".format(candidate_depths[best], cv_scores[best]))
    # Score-versus-parameter curves.
    plt.figure(figsize=(6, 4), dpi=144)
    plt.grid()
    plt.xlabel("決策樹的max_depth引數")
    plt.ylabel("score")
    curves = (
        (train_scores, '.r--', "訓練集得分"),
        (cv_scores, '.b--', "驗證集得分"),
    )
    for series, fmt, label in curves:
        plt.plot(candidate_depths, series, fmt, label=label)
    plt.legend()
    plt.show()


# 尋找min_impurity_decrease(決策樹前剪枝:資訊熵或基尼不純度的閾值)
def find_min_impurity_decrease():
    """Scan ``min_impurity_decrease`` (pre-pruning threshold) and plot the scores.

    Reads the module-level ``X_train`` / ``y_train``, evaluates 50 evenly
    spaced thresholds in [0, 0.5] with the Gini criterion via ``get_score``,
    prints the threshold with the highest validation score and shows the
    train/validation curves.
    """
    thresholds = np.linspace(0, 0.5, 50)
    results = [
        get_score(X_train, y_train, criterion='gini', min_impurity_decrease=threshold)
        for threshold in thresholds
    ]
    # Split the (train, validation) pairs into two parallel lists.
    train_scores, cv_scores = map(list, zip(*results))
    # Position of the best validation score; it indexes thresholds too.
    best = np.argmax(cv_scores)
    print("最好的引數值:{},對應的驗證集評分:{}".format(thresholds[best], cv_scores[best]))
    # Score-versus-parameter curves.
    plt.figure(figsize=(6, 4), dpi=144)
    plt.grid()
    plt.xlabel("決策樹的min_impurity_decrease引數")
    plt.ylabel("score")
    curves = (
        (train_scores, '.r--', "訓練集得分"),
        (cv_scores, '.b--', "驗證集得分"),
    )
    for series, fmt, label in curves:
        plt.plot(thresholds, series, fmt, label=label)
    plt.legend()
    plt.show()


# clf.cv_results_儲存了計算過程的所有中間結果,用它來繪製score隨引數變化圖
def plot_curve(xs, cv_results, xlabel):
    """Plot mean train and validation scores with a one-standard-deviation band.

    Parameters:
        xs: parameter values for the x axis.
        cv_results: mapping that provides 'mean_train_score',
            'std_train_score', 'mean_test_score' and 'std_test_score'.
        xlabel: label for the x axis.
    """
    train_mean, train_std = cv_results['mean_train_score'], cv_results['std_train_score']
    test_mean, test_std = cv_results['mean_test_score'], cv_results['std_test_score']
    # (mean, std, colour, legend label) for each curve.
    bands = (
        (train_mean, train_std, 'r', '訓練集score平均值'),
        (test_mean, test_std, 'b', 'cv集score平均值'),
    )
    plt.figure(figsize=(6, 4), dpi=144)
    plt.title("引數變化影響score")
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel("得分")
    # Shaded mean +/- std bands first, then the mean curves.
    for mean, std, colour, _ in bands:
        plt.fill_between(xs, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in bands:
        plt.plot(xs, mean, '.--', color=colour, label=label)
    plt.legend(loc='best')
    plt.show()


# 在多組引數中選擇最優的引數
def find_in_mix():
    """Grid-search several decision-tree parameters at once and print the best.

    Reads the module-level ``X`` / ``y``. Two sub-grids are searched, one per
    split criterion, each with its own ``min_impurity_decrease`` range, using
    5-fold cross-validation.
    """
    # Settings shared by both criteria.
    shared = {'max_depth': range(2, 10), 'min_samples_split': range(2, 30, 2)}
    param_grid = [
        {'criterion': ['entropy'], 'min_impurity_decrease': np.linspace(0, 1, 50), **shared},
        {'criterion': ['gini'], 'min_impurity_decrease': np.linspace(0, 0.5, 50), **shared},
    ]
    # Every combination inside each dictionary is evaluated.
    search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
    search.fit(X, y)
    print("最佳引數和引數值:{}\n最佳得分:{}".format(search.best_params_, search.best_score_))


if __name__ == '__main__':
    # Load and preprocess the training file.
    with open(BASE_DIR + "z7/train.csv") as train_file:
        data = read_dataset(train_file)
    # "Survived" is the label; every other column is a feature.
    X = data.drop(['Survived'], axis=1).values
    y = data['Survived'].values
    # Hold out 20% as a validation set; the helper functions read these globals.
    X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2)
    # Disabled single-split exploration. Its results vary between runs because
    # the split is random; the cross-validated grid search below averages over
    # folds instead.
    #     train_score, cv_score = get_score(X_train, y_train)
    #     print("訓練集得分:{},驗證集得分:{}".format(train_score, cv_score))
    #     find_max_depth()
    #     find_min_impurity_decrease()
    # Range narrowed after inspecting the plot.
    thresholds = np.linspace(0, 0.06, 50)
    # 5-fold cross-validation for every threshold. return_train_score=True is
    # needed so that cv_results_ holds the train-score entries plot_curve reads.
    search = GridSearchCV(
        DecisionTreeClassifier(),
        {'min_impurity_decrease': thresholds},
        cv=5,
        return_train_score=True,
    )
    search.fit(X, y)
    print("最佳引數和引數值:{}\n最佳得分:{}".format(search.best_params_, search.best_score_))
    plot_curve(thresholds, search.cv_results_, "min_impurity_decrease")
    # Try the combined multi-parameter search.
    print('-' * 20 + '多組混合' + '-' * 20)
    find_in_mix()

執行結果：

最佳引數和引數值:{'min_impurity_decrease': 0.006122448979591836}
最佳得分:0.8114478114478114

在這裡插入圖片描述

--------------------多組混合--------------------
最佳引數和引數值:{'criterion': 'entropy', 'max_depth': 8, 'min_impurity_decrease': 0.0, 'min_samples_split': 22}
最佳得分:0.819304152637486

第一次提交Kaggle

在Kaggle上可以看到有三個csv檔案，除了訓練集和測試集（不帶標籤）之外，還有一個就是提交的樣本。提交的時候是提交一個和提交樣本格式一樣的csv，伺服器會將提交的結果和測試集實際的標籤進行對比，這個問題裡是用ACC（準確率）來評分的。

前面得到了最佳（其實是在給定的引數內使得cv集的平均score最高）的引數，用它來對整個樣本集擬合，然後生成模型，對測試集進行預測，然後變成指定格式的csv檔案提交即可。

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Base directory of the data files; the script appends "z7/train.csv" and "z7/test.csv" to it.
BASE_DIR = "E:/WorkSpace/ReadingNotes/scikit-learn機器學習/data/"


# 讀取泰坦尼克號資料,並做一定的預處理
def read_dataset(file, embarked_values=('S', 'C', 'Q')):
    """Load a Titanic CSV and return it as an all-numeric DataFrame.

    Parameters:
        file: path or open file object, passed straight to ``pandas.read_csv``.
            The first column (PassengerId) becomes the row index.
        embarked_values: port labels in encoding order; the label at position
            ``i`` is encoded as ``i``. Missing values and labels that are not
            listed are encoded as ``len(embarked_values)``.

    Returns:
        The DataFrame without the Name/Ticket/Cabin columns, with Sex encoded
        as male=1 / female=0, Embarked encoded as integers, and every
        remaining NaN replaced by 0.
    """
    df = pd.read_csv(file, index_col=0)
    # Free-text columns that are not used as features.
    df = df.drop(["Name", "Ticket", "Cabin"], axis=1)
    # Boolean mask -> 1 for male, 0 for female.
    df['Sex'] = (df['Sex'] == 'male').astype('int')
    # Encode the port with a fixed table. Numbering the labels by their order
    # of first appearance would give the same port different codes in the
    # training and test files when they list the ports in a different order,
    # and the model would then be fed inconsistent features.
    ports = tuple(embarked_values)
    port_codes = {port: code for code, port in enumerate(ports)}
    # Series.map leaves unmapped entries (including NaN) as NaN; they all get
    # the single extra code that follows the listed ports.
    df['Embarked'] = df['Embarked'].map(port_codes).fillna(len(ports)).astype('int')
    # Any other missing value (NaN) becomes 0.
    return df.fillna(0)


if __name__ == '__main__':
    # Load and preprocess the training file.
    with open(BASE_DIR + "z7/train.csv") as train_file:
        train_df = read_dataset(train_file)
    # "Survived" is the label; every other column is a feature.
    X = train_df.drop(['Survived'], axis=1).values
    y = train_df['Survived'].values
    # Fit on the whole training set with the parameters found by the grid search.
    model = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_impurity_decrease=0.0, min_samples_split=22)
    model.fit(X, y)
    print(model.score(X, y))
    # Preprocess the test file the same way and predict.
    # NOTE: read_dataset must encode the columns identically for both files,
    # otherwise the model sees features that differ from those it was trained on.
    with open(BASE_DIR + "z7/test.csv") as test_file:
        test_df = read_dataset(test_file)
    survived = model.predict(test_df.values).astype(np.int32)
    # Two-column layout: PassengerId and the predicted Survived flag.
    submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': survived})
    submission.to_csv("./titanic.csv", index=False)

【SciKit-Learn學習筆記】4：決策樹擬合泰坦尼克號資料集並提交到Kaggle

決策樹擬合泰坦尼克號資料集

第一次提交Kaggle

【SciKit-Learn學習筆記】4：決策樹擬合泰坦尼克號資料集並提交到Kaggle

【SciKit-Learn學習筆記】5：核SVM分類和預測乳腺癌資料集

【SciKit-Learn學習筆記】8：k-均值演算法做文字聚類,聚類演算法效能評估

【SciKit-Learn學習筆記】7：PCA結合SVM做AT&T資料集人物影象分類

【SciKit-Learn學習筆記】6：樸素貝葉斯做文件分類並繪製混淆矩陣

【SciKit-Learn學習筆記】3：線性迴歸測算波士頓房價,Logistic迴歸預測乳腺癌

【SciKit-Learn學習筆記】2：kNN分類/迴歸,在糖尿病資料集上的表現

【SciKit-Learn學習筆記】1：SVM預測digits資料集,繪製隨機波動樣本的學習曲線

機器學習之路: python 決策樹分類預測泰坦尼克號乘客是否幸存

【Vue.js學習筆記】4：事件修飾符,鍵盤事件,鍵值修飾符

Python機器學習入門1.8《使用整合模型預測泰坦尼克號乘客的生還情況預測》

【TensorFlow學習筆記】4：認識Variable及其重用(共享),在scope上的初始化

【Spring學習筆記】4：三種做屬性連線的Annotation,指示初始化和銷燬方法的Annotation

【Maven學習筆記】4：在IDEA中使用Maven搭建SSM空專案

【python學習筆記】46：隨機漫步,埃拉托色尼篩法,蒙特卡洛演算法,多項式迴歸

【Vue.js學習筆記】9：使用npm搭建Vue-CLI腳手架並建立Vue專案

【Vue.js學習筆記】8：建立多個Vue例項物件,認識Vue中的元件

【Vue.js學習筆記】7：v-for渲染,Vue的小Demo

【Vue.js學習筆記】6：動態繫結CSS樣式,條件渲染和v-show

【Vue.js學習筆記】5：雙向資料繫結,計算屬性

【SciKit-Learn學習筆記】4：決策樹擬合泰坦尼克號資料集並提交到Kaggle

決策樹擬合泰坦尼克號資料集

第一次提交Kaggle

相關推薦