樹迴歸:CART演算法構建迴歸樹和模型樹(程式碼筆記)

樹迴歸:CART演算法構建迴歸樹和模型樹(程式碼筆記)

分類迴歸樹(Classification And Regression Trees,CART)是一種構造樹的監督學習方法。

和ID3決策樹作比較:

1. ID3每次直接用最佳特徵分割資料,即如果當前特徵有4個可能值,那麼資料將被分成4份,處理的是標稱型資料,不能直接處理連續型資料。CART則利用二元切分來處理連續型變數,每次會找一個最佳特徵的閾值,把資料集分成兩部分,也就是左子樹和右子樹。

2. CART使用方差計算來代替夏農熵。但目的都是找最佳切分特徵。

import numpy as np
'''
CART使用二元切分來處理連續型變數。
迴歸樹和分類樹類似,只是葉節點的資料型別是連續型不是離散型
(其實也不是真正的“連續”,切分時仍取決於屬性值,只不過數值都是浮點數)
以下是兩種CART:迴歸樹,模型樹
'''
def loadData(filename):
    """Load a tab-delimited numeric data file.

    Args:
        filename: path to a text file; each line holds tab-separated numbers.

    Returns:
        list of lists of floats, one inner list per input line.
    """
    dataM = []
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(filename) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            # list(...) is required: in Python 3 map() returns a lazy
            # iterator, which would break later numpy matrix construction.
            fltLine = list(map(float, curLine))
            dataM.append(fltLine)
    return dataM

# ----------------- Regression tree: each leaf holds a single constant -----------------
def regLeaf(data):
    """Leaf generator when the data will not be split further.

    The last column holds the target variable; a regression-tree leaf
    is simply its mean.
    """
    return data[:, -1].mean()

def regErr(data):
    """Total squared error of the targets: variance times sample count."""
    rows = np.shape(data)[0]
    return rows * np.var(data[:, -1])

# Find the best split position (feature) and threshold
def chooseBestSplit(data, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the best feature index and threshold for a binary split.

    Tries every (feature, value) pair and measures the combined error of
    the two resulting subsets with errType.  Declines to split (returning
    (None, leaf)) — i.e. pre-pruning — when:
      * all target values are identical,
      * the best split reduces the error by less than tolS,
      * either resulting subset holds fewer than tolN samples.

    Args:
        data: np.matrix whose last column is the target variable.
        leafType: callable that builds a leaf model from a data set.
        errType: callable that measures the error of a data set.
        ops: (tolS, tolN) pruning tolerances.

    Returns:
        (featureIndex, splitValue), or (None, leafValue) when no
        acceptable split exists.
    """
    tolS = ops[0]  # minimum error reduction required to accept a split
    tolN = ops[1]  # minimum number of samples allowed in each child
    if len(set(data[:,-1].T.tolist()[0])) == 1:  # all targets identical
        return None, leafType(data)
    m, n = np.shape(data)
    S = errType(data)  # error of the unsplit data set
    bestS = np.inf  # original used bare `inf`: a NameError
    bestIdx = 0
    bestVal = 0
    for featIdx in range(n-1):
        # Flatten the matrix column to plain floats first: set() over a
        # matrix column raises TypeError (rows are unhashable).
        for splitVal in set(data[:, featIdx].T.tolist()[0]):
            mat0, mat1 = binSplitData(data, featIdx, splitVal)
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue    # split would leave a child too small
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIdx = featIdx
                bestVal = splitVal
                bestS = newS
    # Compare against the BEST error found, not the last one tried:
    # the original used newS, which is wrong and may even be unbound
    # when every candidate split was skipped.
    if (S - bestS) < tolS:
        return None, leafType(data)  # error reduction too small: stop
    mat0, mat1 = binSplitData(data, bestIdx, bestVal)
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(data)  # resulting subsets too small: stop
    return bestIdx, bestVal

# Binary split of the data set on one feature at a threshold
def binSplitData(data, feature, value):
    """Split `data` on column `feature` at threshold `value`.

    Returns (left, right): rows strictly greater than value form the
    left subtree's data, the remaining rows the right subtree's.
    """
    above = np.nonzero(data[:, feature] > value)[0]
    below = np.nonzero(data[:, feature] <= value)[0]
    return data[above, :], data[below, :]

def createTree(data, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively grow a CART regression tree over `data`.

    Returns either a leaf value (when chooseBestSplit declines to split;
    here a constant, the mean of the targets) or a dict with keys
    'spInd', 'spVal', 'left', 'right'.
    """
    feat, val = chooseBestSplit(data, leafType, errType, ops)
    if feat is None:
        # chooseBestSplit decided not to split; val is the leaf model.
        return val
    left, right = binSplitData(data, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(left, leafType, errType, ops),
        'right': createTree(right, leafType, errType, ops),
    }

# ------------------ 模型樹(model tree)每個葉節點包含一個線性方程 -------------------
def linearNode(data):
    m, n = np.shape(data)
    x = np.mat(np.ones((m,n)))
    y = np.mat(np.ones((m,1)))
    x[:, 1:n] = data[:, 0:n-1]
    y = data[:, -1]
    xTx = x.T * x
    if linalg.det(xTx) == 0.0:
        raise(NameError('This matrix is singular, cannot do inverse'))
    w = xTx.I * (x.T * y)
    return w, x, y

def modelLeaf(data):
    """Leaf generator for a model tree: the fitted regression weights."""
    weights, _, _ = linearNode(data)
    return weights

def modelErr(data):
    """Sum of squared residuals of a linear fit on `data`."""
    w, x, y = linearNode(data)
    residual = y - x * w
    return np.sum(np.power(residual, 2))

# NOTE(review): this redefines createTree from the regression-tree section;
# in a single module only this model-tree version (defaults modelLeaf /
# modelErr) survives.  Consider giving the two builders distinct names.
def createTree(data, leafType=modelLeaf, errType=modelErr, ops=(1,4)):
    """Recursively grow a CART model tree over `data`.

    Returns either a leaf value (when chooseBestSplit declines to split;
    here the linear-regression coefficients) or a dict with keys
    'spInd', 'spVal', 'left', 'right'.
    """
    feat, val = chooseBestSplit(data, leafType, errType, ops)
    if feat is None:
        # chooseBestSplit decided not to split; val is the leaf model.
        return val
    left, right = binSplitData(data, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(left, leafType, errType, ops),
        'right': createTree(right, leafType, errType, ops),
    }

# ----------------------- Prediction with regression and model trees -----------------------
def regTreeEval(treeNode, xdata):
    """Evaluate a regression-tree leaf: the stored constant (xdata unused)."""
    return float(treeNode)

def modelTreeEval(treeNode, xdata): # leaf holds regression coefficients
    """Evaluate a model-tree leaf: prepend a 1 (intercept) to the sample
    and take the dot product with the stored weight vector."""
    nFeat = np.shape(xdata)[1]
    augmented = np.mat(np.ones((1, nFeat + 1)))
    augmented[:, 1:] = xdata
    return float(augmented * treeNode)

def isTree(obj):
    """Return True when obj is an internal node (a dict), not a leaf."""
    return type(obj) is dict

# modelEval selects the tree kind by decoding its leaves:
# regTreeEval for regression trees, modelTreeEval for model trees.
def treePredict(tree, xTest, modelEval=regTreeEval):
    """Predict the target for one sample by walking the tree.

    At each internal node, samples whose split-feature value exceeds the
    threshold descend into the left branch, the rest into the right.
    """
    if not isTree(tree):
        return modelEval(tree, xTest)
    if xTest[tree['spInd']] > tree['spVal']:
        branch = tree['left']   # above threshold: left subtree
    else:
        branch = tree['right']  # at or below threshold: right subtree
    if isTree(branch):
        return treePredict(branch, xTest, modelEval)
    return modelEval(branch, xTest)