西瓜書決策樹ID3演算法實現,離散屬性
阿新 • 發佈:2018-11-27
import csv
import operator
from math import log


def readDataset(filename):
    """Read the watermelon data set from a CSV file.

    Column 0 is a sample id and is skipped; columns 1-6 hold the discrete
    attributes and column 7 the class label.

    :param filename: path to the CSV data file
    :return: (dataset, labels) — each dataset row is the six attribute
             values followed by the class label; labels is the list of
             attribute names taken from the header row
    """
    with open(filename) as f:
        reader = csv.reader(f)
        header_row = next(reader)
        labels = header_row[1:7]      # attribute names, id column dropped
        dataset = []
        for line in reader:
            tempVect = line[1:7]      # the six discrete attribute values
            tempVect.append(line[7])  # class label goes last
            dataset.append(tempVect)
    return dataset, labels


def infoEnt(dataset):
    """Return the information entropy of *dataset*.

    Ent(D) = -sum(p_k * log2(p_k)) over the class labels found in the
    last column of each row.

    :param dataset: list of samples, class label in the last position
    :return: entropy as a float (0.0 for a pure set)
    """
    numdata = len(dataset)
    counts = {}
    for featVec in dataset:
        label = featVec[-1]
        counts[label] = counts.get(label, 0) + 1
    ent = 0.0
    for cnt in counts.values():
        prop = float(cnt) / numdata
        ent -= prop * log(prop, 2)
    return ent


def splitDataset(dataset, axis, value):
    """Select the samples whose attribute *axis* equals *value*.

    The matching attribute column itself is removed from each returned
    row, so child nodes never re-split on a used attribute.

    :param dataset: list of samples
    :param axis: index of the attribute to split on
    :param value: attribute value to keep
    :return: the reduced sub-dataset
    """
    restDataset = []
    for featVec in dataset:
        if featVec[axis] == value:
            restDataset.append(featVec[:axis] + featVec[axis + 1:])
    return restDataset


def bestFeatureSplit(dataset):
    """Pick the attribute with the highest information gain (ID3 rule).

    :param dataset: list of samples, class label in the last position
    :return: index of the best attribute, or -1 when no attribute gives a
             strictly positive gain (original behaviour preserved)
    """
    numFeature = len(dataset[0]) - 1
    baseInfoEnt = infoEnt(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        uniqueValue = {example[i] for example in dataset}
        newEnt = 0.0
        for value in uniqueValue:
            subDataset = splitDataset(dataset, i, value)
            prop = len(subDataset) / float(len(dataset))
            newEnt += prop * infoEnt(subDataset)  # weighted child entropy
        infoGain = baseInfoEnt - newEnt
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorClass(classList):
    """Return the most frequent class label in *classList*.

    Used to label leaves when no attributes remain or when an attribute
    value never occurs in the current subset.

    :param classList: class labels of the samples reaching a node
    :return: the majority label
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataset, labels, datasetFull, labelsFull):
    """Recursively build an ID3 decision tree over discrete attributes.

    :param dataset: current (sub-)dataset, class label in the last column
    :param labels: attribute names matching the dataset columns
    :param datasetFull: the complete original dataset, used to discover
                        attribute values missing from the current subset
    :param labelsFull: the complete original attribute-name list
    :return: nested dict {attribute: {value: subtree-or-label}}, or a
             bare class label for a leaf
    """
    classList = [example[-1] for example in dataset]
    # Stop when the node is pure.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop when no attributes remain; vote on the majority class.
    if len(dataset[0]) == 1:
        return majorClass(classList)

    bestFeat = bestFeatureSplit(dataset)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}

    # Fix: operate on a copy so the caller's label list is not mutated
    # (the original `del labels[bestFeat]` destroyed the caller's list).
    labels = labels[:]
    del labels[bestFeat]

    uniqueVal = {example[bestFeat] for example in dataset}
    # Every value this attribute takes in the FULL dataset, so values
    # absent from the current subset still get a branch.
    bestFeatIndex = labelsFull.index(bestFeatLabel)
    uniqueValFull = {example[bestFeatIndex] for example in datasetFull}

    for value in uniqueVal:
        subLabels = labels[:]  # each branch recurses with its own copy
        myTree[bestFeatLabel][value] = createTree(
            splitDataset(dataset, bestFeat, value),
            subLabels, datasetFull, labelsFull)
    # Values never seen in this subset become leaves labelled with the
    # parent node's majority class (handles missing attribute values).
    for value in uniqueValFull - uniqueVal:
        myTree[bestFeatLabel][value] = majorClass(classList)
    return myTree


if __name__ == '__main__':
    filename = 'C:\\Users\\14399\\Desktop\\西瓜2.0.csv'
    dataset, labels = readDataset(filename)
    datasetFull = dataset[:]
    labelsFull = labels[:]
    myTree = createTree(dataset, labels, datasetFull, labelsFull)
    print(myTree)
生成決策樹:{'紋理': {'稍糊': {'觸感': {'硬滑': '否', '軟粘': '是'}}, '模糊': '否', '清晰': {'根蒂': {'硬挺': '否', '稍蜷': {'色澤': {'烏黑': {'觸感': {'硬滑': '是', '軟粘': '否'}}, '青綠': '是', '淺白': '是'}}, '蜷縮': '是'}}}}
西瓜2.0資料集:連結:https://pan.baidu.com/s/12aVngexje2RdizgOg1Fr0A 提取碼:uywy
參考:https://blog.csdn.net/csqazwsxedc/article/details/65697652