程式人生 > 西瓜書決策樹ID3演算法實現,離散屬性

西瓜書決策樹ID3演算法實現,離散屬性

from math import log
import operator
import csv


def readDataset(filename, encoding=None):
    '''
    Read the training data from a CSV file.

    The first row is treated as the header. Column 0 is ignored, columns 1-6
    are the attributes and column 7 is the class label. Blank lines in the
    file are skipped.

    :param filename: path of the CSV file
    :param encoding: text encoding passed to ``open``; ``None`` keeps the
                     platform default
    :return: ``(dataset, labels)`` where ``dataset`` is a list of rows
             (six attribute values followed by the class label) and
             ``labels`` is the list of the six attribute names
    :raises StopIteration: if the file has no header row
    :raises IndexError: if a non-blank data row has fewer than eight columns
    '''
    # newline='' lets the csv module do its own line-ending handling, so
    # quoted fields containing line breaks are parsed correctly.
    with open(filename, newline='', encoding=encoding) as f:
        reader = csv.reader(f)
        header_row = next(reader)
        labels = header_row[1:7]
        dataset = []
        for line in reader:
            if not line:
                # csv.reader yields [] for a blank line; there is nothing to load.
                continue
            tempVect = line[1:7]
            # Indexing (rather than slicing) keeps truncated rows an error.
            tempVect.append(line[7])
            dataset.append(tempVect)
    return dataset, labels


def infoEnt(dataset):
    '''
    Compute the base-2 information entropy of the class labels in a dataset.

    :param dataset: list of rows; the last element of each row is its class label
    :return: the entropy of the class distribution (0 for an empty dataset)
    '''
    total = len(dataset)
    # Tally how many rows carry each class label.
    counts = {}
    for row in dataset:
        cls = row[-1]
        counts[cls] = counts.get(cls, 0) + 1
    entropy = 0
    for count in counts.values():
        ratio = float(count) / total
        entropy -= ratio * log(ratio, 2)
    return entropy


def splitDataset(dataset, axis, value):
    '''
    Select the rows whose attribute at ``axis`` equals ``value`` and drop
    that attribute column from them.

    :param dataset: list of rows (each row a list)
    :param axis: index of the attribute to split on
    :param value: attribute value to keep
    :return: new list of new rows, each without the column at ``axis``
    '''
    return [
        row[:axis] + row[axis + 1:]
        for row in dataset
        if row[axis] == value
    ]


def bestFeatureSplit(dataset):
    '''
    Pick the attribute whose split yields the largest information gain.

    :param dataset: list of rows; attributes first, class label last
    :return: index of the best attribute, or -1 when no attribute gives a
             strictly positive information gain
    '''
    attributeCount = len(dataset[0]) - 1
    parentEntropy = infoEnt(dataset)
    total = float(len(dataset))
    chosen, topGain = -1, 0
    for col in range(attributeCount):
        # Entropy after the split: subset entropies weighted by subset size.
        weighted = 0
        for val in {row[col] for row in dataset}:
            part = splitDataset(dataset, col, val)
            weighted += len(part) / total * infoEnt(part)
        gain = parentEntropy - weighted
        # Strict comparison: on ties the earliest attribute wins.
        if gain > topGain:
            chosen, topGain = col, gain
    return chosen


def majorClass(classList):
    '''
    Return the most frequent class label in a list of labels.

    :param classList: class labels of the samples at a node
    :return: the label with the highest count; on a tie, the tied label
             that appears first in ``classList``
    '''
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # Stable descending sort keeps first-seen order among equal counts.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[0]


def createTree(dataset, labels, datasetFull, labelsFull):
    '''
    Recursively build an ID3 decision tree over discrete attributes.

    :param dataset: samples reaching this node; each row holds the remaining
                    attribute values followed by the class label
    :param labels: attribute names matching the columns of ``dataset``
                   (this list is not modified)
    :param datasetFull: the complete training set, used to enumerate every
                        value the chosen attribute can take
    :param labelsFull: attribute names matching the columns of ``datasetFull``
    :return: a class label for a leaf, otherwise a nested dict of the form
             ``{attribute_name: {attribute_value: subtree_or_label}}``
    '''
    classList = [example[-1] for example in dataset]
    # All samples share one class: pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: no attribute to split on.
    if len(dataset[0]) == 1:
        return majorClass(classList)
    bestFeat = bestFeatureSplit(dataset)
    # bestFeatureSplit returns -1 when no attribute has positive gain. Using
    # -1 as an index would split on the class column, so stop with a
    # majority-class leaf instead.
    if bestFeat < 0:
        return majorClass(classList)
    bestFeatLabel = labels[bestFeat]
    # Build a new label list instead of deleting from the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    branches = {}
    for value in {example[bestFeat] for example in dataset}:
        branches[value] = createTree(splitDataset(dataset, bestFeat, value),
                                     subLabels, datasetFull, labelsFull)

    # Values of this attribute that occur in the full data but not at this
    # node get a leaf labelled with this node's majority class.
    bestFeatIndex = labelsFull.index(bestFeatLabel)
    missing = {example[bestFeatIndex] for example in datasetFull} - set(branches)
    if missing:
        fallback = majorClass(classList)
        for value in missing:
            branches[value] = fallback
    return {bestFeatLabel: branches}


if __name__ == '__main__':
    import sys

    # The CSV path may be given as the first command-line argument; without
    # one, the original hard-coded location is used.
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = 'C:\\Users\\14399\\Desktop\\西瓜2.0.csv'
    dataset, labels = readDataset(filename)
    # Shallow copies passed as the "full" data set and attribute list that
    # createTree consults at every level of the recursion.
    datasetFull = dataset[:]
    labelsFull = labels[:]
    myTree = createTree(dataset, labels, datasetFull, labelsFull)
    print(myTree)

生成決策樹:{'紋理': {'稍糊': {'觸感': {'硬滑': '否', '軟粘': '是'}}, '模糊': '否', '清晰': {'根蒂': {'硬挺': '否', '稍蜷': {'色澤': {'烏黑': {'觸感': {'硬滑': '是', '軟粘': '否'}}, '青綠': '是', '淺白': '是'}}, '蜷縮': '是'}}}}

西瓜2.0資料集:連結:https://pan.baidu.com/s/12aVngexje2RdizgOg1Fr0A   提取碼:uywy 

參考:https://blog.csdn.net/csqazwsxedc/article/details/65697652