# Machine Learning: Regression Decision Tree (DecisionTreeRegressor)
# (Originally a blog article header; converted to comments so the file parses.)
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 23 20:00:23 2018

@author: muli
"""

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn import cross_validation
import matplotlib.pyplot as plt


def creat_data(n):
    """
    Generate a toy dataset for a regression problem: y = sin(x), with
    noise added to every 5th sample.

    :param n: number of samples to generate
    :return: a 4-tuple (X_train, X_test, y_train, y_test) — training
        samples, test samples, training targets, test targets. The test
        set holds 25% of the data.
    """
    np.random.seed(0)
    # n*1 matrix of uniform random values in [0, 5)
    # (np.random.rand yields [0, 1); scaling by 5 widens the range).
    X = 5 * np.random.rand(n, 1)
    y = np.sin(X).ravel()
    # Add noise in [-1.5, 1.5) to every 5th sample.
    noise_num = n // 5
    y[::5] += 3 * (0.5 - np.random.rand(noise_num))
    # `sklearn.cross_validation` was removed in scikit-learn 0.20; use
    # `model_selection` and fall back to the legacy module when running
    # on a very old scikit-learn.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # scikit-learn < 0.18
        from sklearn.cross_validation import train_test_split
    return train_test_split(X, y, test_size=0.25, random_state=1)
    

def test_DecisionTreeRegressor(*data):
    """
    Demonstrate basic DecisionTreeRegressor usage: fit on the training
    set, print R^2 scores, and plot the fitted curve against the samples.

    :param data: variadic tuple (X_train, X_test, y_train, y_test) —
        training samples, test samples, training targets, test targets
    :return: None
    """
    X_train, X_test, y_train, y_test = data
    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    print("Training score:%f" % (model.score(X_train, y_train)))
    print("Testing score:%f" % (model.score(X_test, y_test)))
    # Predict over a dense grid to visualize the fitted step function.
    grid = np.arange(0.0, 5.0, 0.05)[:, np.newaxis]
    predictions = model.predict(grid)
    # Plot training/test samples together with the regression curve.
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.scatter(X_train, y_train, label="train sample", c='g')
    axes.scatter(X_test, y_test, label="test sample", c='r')
    axes.plot(grid, predictions, label="predict_value", linewidth=2, alpha=0.5)
    axes.set_xlabel("data")
    axes.set_ylabel("target")
    axes.set_title("Decision Tree Regression")
    axes.legend(framealpha=0.5)
    plt.show()
    
    
def test_DecisionTreeRegressor_splitter(*data):
    """
    Compare the effect of the 'best' versus 'random' split strategy of
    DecisionTreeRegressor, printing train/test R^2 scores for each.

    :param data: variadic tuple (X_train, X_test, y_train, y_test) —
        training samples, test samples, training targets, test targets
    :return: None
    """
    X_train, X_test, y_train, y_test = data
    for strategy in ('best', 'random'):
        model = DecisionTreeRegressor(splitter=strategy)
        model.fit(X_train, y_train)
        print("Splitter %s" % strategy)
        print("Training score:%f" % (model.score(X_train, y_train)))
        print("Testing score:%f" % (model.score(X_test, y_test)))
        print("----------------------")
    

def test_DecisionTreeRegressor_depth(*data, maxdepth):
    """
    Show how max_depth affects DecisionTreeRegressor performance by
    plotting train/test R^2 scores for depths 1 .. maxdepth-1.

    :param data: variadic tuple (X_train, X_test, y_train, y_test) —
        training samples, test samples, training targets, test targets
    :param maxdepth: int, exclusive upper bound of the max_depth sweep
    :return: None
    """
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth)
    training_scores = []
    testing_scores = []
    for depth in depths:
        regr = DecisionTreeRegressor(max_depth=depth)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))

    # Plot score vs. depth for both sets — the gap between the curves
    # visualizes over-fitting as the tree grows deeper.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(depths, training_scores, label="training score")  # fixed label typo "traing"
    ax.plot(depths, testing_scores, label="testing score")
    ax.set_xlabel("maxdepth")
    ax.set_ylabel("score")
    ax.set_title("Decision Tree Regression")
    ax.legend(framealpha=0.5)
    plt.show()

  
if __name__ == '__main__':
    # Build the regression dataset (100 samples), then run the
    # max_depth experiment. The other demos are kept for reference.
    X_train, X_test, y_train, y_test = creat_data(100)
#    test_DecisionTreeRegressor(X_train, X_test, y_train, y_test)
#    test_DecisionTreeRegressor_splitter(X_train, X_test, y_train, y_test)
    test_DecisionTreeRegressor_depth(X_train, X_test, y_train, y_test, maxdepth=20)