[Machine Learning] Decision Trees (ID3, C4.5, and CART Classification/Regression Trees): a Python 3 Implementation

This post contains the core parts of the three algorithms.

I have not found a really good test dataset, but the code is enough to work through the logic of each algorithm.

For pruning, only post-pruning of the CART regression tree is implemented so far.
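
For reference, the split criteria implemented below are the standard ones. ID3 maximizes the information gain g(D, A) = H(D) - sum(|Di|/|D| * H(Di)), with empirical entropy H(D) = -sum(pi * log(pi)). C4.5 maximizes the gain ratio g(D, A) / H_A(D), where H_A(D) = -sum(|Di|/|D| * log(|Di|/|D|)) is the entropy of the split itself. The CART classification tree minimizes the weighted Gini index, with Gini(D) = 1 - sum(pi**2), and the CART regression tree minimizes the total squared error sum((yi - mean(y))**2) over the two halves of a split.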

import numpy as np
from collections import Counter
from sklearn import datasets


class DecisionTree:
    def __init__(self, algorithm='ID3'):
        """選擇誰用的演算法,可選的有ID3,C4.5,CARTcla(CART分類樹),CARTreg(CART迴歸樹)"""
        self.algorithm = algorithm

    @staticmethod
    def cal_entroy(dataset):
        """
        計算資料集的經驗熵,資料集為np.array
        :param dataset: 資料集m*n,m為樣本數,n為特徵數
        :return: 資料集的經驗熵
        """
        m = dataset.shape[0]  # 樣本數
        labels = Counter(dataset[:, -1].reshape(m).tolist())  # 獲取類別及其出現的次數
        entroy = 0  # 初始化經驗熵
        for amount in labels.values():
            prob = amount / m  # 計算概率pi
            entroy -= prob * np.log(prob)  # e=-sum(pi*log(pi))
        return entroy

    @staticmethod
    def cal_gini(dataset):
        """
        計算資料集的基尼指數,資料集為np.array
        :param dataset: 資料集m*n,m為樣本數,n為特徵數
        :return: 資料集的基尼指數
        """
        m = dataset.shape[0]
        labels = Counter(dataset[:, -1].reshape(m).tolist())
        gini = 1
        for amount in labels.values():
            prob = amount / m
            gini -= prob**2  # g=1-sum(pi**2)
        return gini

    @staticmethod
    def cal_se(dataset):
        """
        計算資料集的方差squared error,資料集為np.array
        np.var可直接計算出均方差,乘以樣本數即為方差
        :param dataset: 資料集m*n,m為樣本數,n為特徵數
        :return: 資料集的方差
        """
        return np.var(dataset[:, -1]) * dataset.shape[0] if dataset.shape[0] > 0 else 0

    def split_dataset(self, dataset, feature, value):
        """
        根據特徵feature的特徵值value,劃分資料集
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :param feature: 作為劃分點的特徵的索引
        :param value: 特徵的某一個值
        :return: dataset[feature]==value的資料集,且不再包含feature特徵
        """
        m, n = dataset.shape[0], dataset.shape[1] - 1
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':  # 獲取所有特徵值等於給定值的樣本D,返回去掉該特徵列的D.
            split_data = np.zeros((1, n))  # 初始化一個1*n的二維陣列,便於使用np.concatenate來增添資料,最後輸出結果時再去掉第一行就OK.
            for i in range(m):
                if dataset[i, feature] == value:
                    temp = np.concatenate((dataset[i, : feature], dataset[i, feature + 1:])).reshape(1, n)
                    split_data = np.concatenate((split_data, temp))
            return split_data[1:, :]
        else:  # binary split, used by CART
            if self.algorithm == 'CARTcla':  # CART classification tree: discrete training data
                left = dataset[np.nonzero(dataset[:, feature] == value)[0], :]
                right = dataset[np.nonzero(dataset[:, feature] != value)[0], :]
            else:  # CART regression tree: continuous training data
                left = dataset[np.nonzero(dataset[:, feature] <= value)[0], :]
                right = dataset[np.nonzero(dataset[:, feature] > value)[0], :]
            return left, right

    def cal_entroy_gain(self, base_ent, dataset, feature):
        """
        計算資訊增益,用於ID3
        :param base_ent: 原資料的經驗熵
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :param feature: 作為劃分點的特徵的索引
        :return: 按照指定特徵劃分後的資訊增益
        """
        new_ent = 0
        values = np.unique(dataset[:, feature])  # 獲取特徵值的取值範圍
        for value in values:
            new_ent += self.cal_entroy(self.split_dataset(dataset, feature, value))
        return base_ent - new_ent

    def cal_entroy_gain_rate(self, base_ent, dataset, feature):
        """
        計算資訊增益比,用於C4.5
        :param base_ent: 原資料的經驗熵
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :param feature: 作為劃分點的特徵的索引
        :return: 按照指定特徵劃分後的資訊增益比
        """
        new_ent, split_ent = 0, 0
        values = np.unique(dataset[:, feature])
        for value in values:
            split_data = self.split_dataset(dataset, feature, value)
            new_ent += self.cal_entroy(split_data)
            prob = split_data.shape[0] / dataset.shape[0]
            split_ent -= prob * np.log(prob)
        return (base_ent - new_ent) / split_ent

    def cal_split_gini(self, dataset, feature):
        """
        計算資料集按照某一特徵的值劃分後,可以取得的最小基尼指數,返回該基尼指數和對應的值. 用於CART分類樹
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :param feature: 作為劃分點的特徵的索引
        :return: 最小基尼指數與其對應的特徵值
        """
        values = np.unique(dataset[:, feature])
        min_gini, min_value = np.inf, 0
        for value in values:
            left, right = self.split_dataset(dataset, feature, value)
            new_gini = (left.shape[0] / dataset.shape[0] * self.cal_gini(left)
                        + right.shape[0] / dataset.shape[0] * self.cal_gini(right))
            if new_gini < min_gini:
                min_gini = new_gini
                min_value = value
        return min_gini, min_value

    def cal_split_se(self, dataset, feature):
        """
        計算資料集按照某一特徵的值劃分後,可以取得的最小方差,返回該方差和對應的值. 用於CART迴歸樹
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :param feature: 作為劃分點的特徵的索引
        :return: 最小基尼指數與其對應的特徵值
        """
        values = np.unique(dataset[:, feature])
        min_se, min_value = np.inf, 0
        for value in values:
            left, right = self.split_dataset(dataset, feature, value)
            new_se = self.cal_se(left) + self.cal_se(right)
            if new_se < min_se:
                min_se = new_se
                min_value = value
        return min_se, min_value

    def choose_best_feature(self, dataset):
        """
        根據各演算法的要求,選取對劃分資料效果最好的特徵.
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :return: 對於ID3和C.45,返回最佳特徵的索引值;對於CART迴歸樹和分類樹,返回最佳特徵的索引值和對應的特徵值
        """
        m, n = dataset.shape[0], dataset.shape[1] - 1
        base_ent = self.cal_entroy(dataset)
        delta_gini, delta_info = np.inf, -np.inf  # 前者用於CART,後者用於ID3和C.45
        best_feature, best_value = -1, 0  # 定義最佳特徵索引和特徵值
        for feature in range(n):
            if self.algorithm == 'ID3':
                newdelta_info = self.cal_entroy_gain(base_ent, dataset, feature)
                if newdelta_info > delta_info:
                    best_feature = feature
                    delta_info = newdelta_info
            elif self.algorithm == 'C4.5':
                newdelta_info = self.cal_entroy_gain_rate(base_ent, dataset, feature)
                if newdelta_info > delta_info:
                    best_feature = feature
                    delta_info = newdelta_info
            elif self.algorithm == 'CARTcla':
                new_gini, value = self.cal_split_gini(dataset, feature)
                if new_gini < delta_gini:
                    delta_gini = new_gini
                    best_value = value
                    best_feature = feature
            else:  # CART regression tree
                new_se, value = self.cal_split_se(dataset, feature)
                if new_se < delta_gini:
                    delta_gini = new_se
                    best_value = value
                    best_feature = feature
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            return best_feature
        else:
            return best_feature, best_value

    def training(self, dataset, feature_label=None):
        """
        訓練模型,即生成決策樹的函式.利用字典來作為樹的資料結構.ID3和C4.5是N叉樹,CART是二叉樹
        :param dataset: 資料集m*(n+1),m為樣本數,n為特徵數
        :param feature_label: 索引值對應的含義列表,若沒有給定,則用初始資料的索引值代替.
        :return: 字典形式的決策樹
        """
        dataset = np.array(dataset)
        targets = dataset[:, -1]
        if np.unique(targets).shape[0] == 1:  # only one class left among the labels: return it
            return targets[0]
        if dataset.shape[1] == 1:  # no features left to split on: return the majority class
            return Counter(targets.tolist()).most_common(1)[0][0]
        if feature_label is None:  # no lookup table given: use the column indices of the original data
            feature_label = [i for i in range(dataset.shape[1] - 1)]

        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            best_feature = self.choose_best_feature(dataset)  # index of the best split feature
            best_feature_label = feature_label[best_feature]  # its meaning
            feature_label_copy = feature_label.copy()  # avoid modifying the caller's list
            feature_label_copy.pop(best_feature)  # this list is passed on to the subtrees, so drop the entry here (otherwise the indices would no longer line up with the right features)
            mytree = {best_feature_label: {}}  # create the root node
            values = np.unique(dataset[:, best_feature])
            for value in values:  # build one subtree per value of the best split feature
                sublabel = feature_label_copy[:]  # the feature-meaning list for the subtree
                mytree[best_feature_label][value] = self.training(self.split_dataset(dataset, best_feature, value),
                                                                  sublabel)
        else:
            best_feature, best_value = self.choose_best_feature(dataset)
            best_feature_label = feature_label[best_feature]
            mytree = dict()
            mytree['FeatLabel'] = best_feature_label  # feature chosen at this node
            mytree['FeatValue'] = best_value  # split value chosen at this node
            l_set, r_set = self.split_dataset(dataset, best_feature, best_value)
            if l_set.shape[0] == 0 or r_set.shape[0] == 0:  # degenerate split (one side empty): make this node a leaf to avoid infinite recursion
                if self.algorithm == 'CARTcla':
                    return Counter(targets.tolist()).most_common(1)[0][0]
                return np.mean(targets)
            mytree['left'] = self.training(l_set, feature_label)  # build the left subtree
            mytree['right'] = self.training(r_set, feature_label)  # build the right subtree
        return mytree

    def predict(self, tree, test_data, feature_label=None):
        """
        使用訓練好的決策樹,對單個待測樣本進行預測.如果要預測一個數據集,可以把資料集拆開來一個一個的進行預測再組合起來.
        :param tree: 訓練好的決策樹
        :param test_data: 待測樣本1*n
        :param feature_label: 索引值對應的含義列表,若沒有給定,則用初始資料的索引值代替.
        :return: 預測結果
        """
        if not isinstance(tree, dict):  # 終止條件,意味著到達葉子結點,返回葉子結點的值
            return tree
        if feature_label is None: 
            feature_label = [i for i in range(test_data.shape[1] - 1)]
        if self.algorithm == 'ID3' or self.algorithm == 'C4.5':
            best_feature_label = list(tree.keys())[0]  # 獲取特徵-含義對照表的值
            best_feature = feature_label.index(best_feature_label)  # 獲取特徵的索引值
            sub_tree = tree[best_feature_label]  # 獲取子樹
            value_of_feat = sub_tree[test_data[best_feature]]  # 找到測試樣本相應特徵值對應的子樹,遍歷該子樹
            return self.predict(value_of_feat, test_data, feature_label)
        else:
            best_feature_label = tree['FeatLabel']
            best_feature = feature_label.index(best_feature_label)
            if self.algorithm == 'CARTcla':  # CART classification tree
                if test_data[best_feature] == tree['FeatValue']:
                    return self.predict(tree['left'], test_data, feature_label)
                else:
                    return self.predict(tree['right'], test_data, feature_label)
            else:  # CART regression tree
                if test_data[best_feature] <= tree['FeatValue']:
                    return self.predict(tree['left'], test_data, feature_label)
                else:
                    return self.predict(tree['right'], test_data, feature_label)

    def prune(self, tree, test_data):
        """
        利用測試集,對生成樹進行後剪枝(CART迴歸樹)
        :param tree: 訓練好的決策樹
        :param test_data: 測試集資料m*(n+1),帶標籤列
        :return: 剪枝後的決策樹
        """
        def istree(tr):  # 判斷是否為決策樹
            return isinstance(tr, dict)

        def getmean(tr):  # collapse a subtree to the mean of all its leaves
            if istree(tr['left']):
                tr['left'] = getmean(tr['left'])
            if istree(tr['right']):
                tr['right'] = getmean(tr['right'])
            return (tr['left'] + tr['right']) / 2

        left = right = None
        if self.algorithm == 'CARTreg':
            if len(test_data) == 0:  # no test data reaches this node: collapse the subtree to the mean of its leaves
                return getmean(tree)
            if istree(tree['left']) or istree(tree['right']):
                left, right = self.split_dataset(test_data, tree['FeatLabel'], tree['FeatValue'])
            if istree(tree['left']):
                tree['left'] = self.prune(tree['left'], left)  # recurse into the left subtree
            if istree(tree['right']):
                tree['right'] = self.prune(tree['right'], right)  # recurse into the right subtree
            if not istree(tree['left']) and not istree(tree['right']):  # both children are leaves
                left, right = self.split_dataset(test_data, tree['FeatLabel'], tree['FeatValue'])
                error_nomerge = np.sum(np.power(left[:, -1] - tree['left'], 2)) + \
                                np.sum(np.power(right[:, -1] - tree['right'], 2))
                tree_mean = (tree['left'] + tree['right']) / 2
                error_merge = np.sum(np.power(test_data[:, -1] - tree_mean, 2))
                if error_merge <= error_nomerge:  # keep whichever of the merged and unmerged versions has the smaller test error
                    return tree_mean
                else:
                    return tree
            return tree


def test():
    """使用sklearn的鳶尾花資料集和生成的迴歸資料集分別對分類模型和迴歸模型測試"""
    dataset1 = datasets.load_iris()
    dataset1 = np.concatenate((dataset1['data'], dataset1['target'].reshape(-1, 1)), axis=1)
    dataset2 = datasets.make_regression()
    dataset2 = np.concatenate((dataset2[0], dataset2[1].reshape(-1, 1)), axis=1)
    dt1 = DecisionTree(algorithm='ID3')
    dt2 = DecisionTree(algorithm='C4.5')
    dt3 = DecisionTree(algorithm='CARTcla')
    dt4 = DecisionTree(algorithm='CARTreg')
    print(dt1.training(dataset1))
    print(dt2.training(dataset1))
    print(dt3.training(dataset1))
    print(dt4.training(dataset2))


if __name__ == '__main__':
    test()
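
test() above only builds the trees. As a minimal usage sketch of the rest of the interface (the variable names below are illustrative, not part of the class), the snippet grows a CART regression tree on part of a synthetic dataset, post-prunes it with the held-out part, and predicts a single sample:

import numpy as np
from sklearn import datasets

# Assumes the DecisionTree class above is defined in the same module.
X, y = datasets.make_regression(n_samples=200, n_features=3, noise=10.0)
data = np.concatenate((X, y.reshape(-1, 1)), axis=1)
train_set, prune_set = data[:150], data[150:]  # hold out part of the data for pruning

reg = DecisionTree(algorithm='CARTreg')
tree = reg.training(train_set)     # grow the full regression tree
tree = reg.prune(tree, prune_set)  # post-prune it with the held-out data
print(reg.predict(tree, prune_set[0, :-1]))  # predict one sample (features only)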