1. 程式人生 > >機器學習實戰-3決策樹

機器學習實戰-3決策樹

  • 劃分依據
    決策樹的主要依據為資訊熵(資訊增益)計算,資訊增益最大的特徵作為分類依據

  • 流程
    建立資料集 –> 計算資訊熵,最大值作為結點,劃分子資料集 –> 遞迴尋找

  • 程式碼

from math import log
import operator
'''
機器學習實戰-第三章(決策樹)
'''

# Build the toy dataset.
def createDataSet():
    """Return the example dataset and the feature label names.

    Each row is [feature0, feature1, class_label].
    """
    dataset = [
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no']
    ]
    labels = ['good', 'bad']
    return dataset, labels


# Compute Shannon entropy.
def calcShannonEnt(dataset):
    """Return the Shannon entropy of the class labels (last column of each row)."""
    numEntries = len(dataset)
    labelsCount = {}
    for featVec in dataset:
        currentLabel = featVec[-1]
        # Count occurrences of each class label.
        labelsCount[currentLabel] = labelsCount.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelsCount:
        # BUG FIX: force float division — under Python 2 the original
        # integer division always produced prop == 0 and entropy == 0.
        prop = labelsCount[key] / float(numEntries)
        shannonEnt -= prop * log(prop, 2)
    return shannonEnt


# Split the dataset on one feature value.
def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at index `axis` equals `value`,
    with that feature column removed (each kept row is a fresh copy)."""
    retDataSet = []
    for featureVec in dataSet:
        if featureVec[axis] == value:
            reducedFeatVec = featureVec[:axis]
            reducedFeatVec.extend(featureVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


# Pick the feature with the largest information gain (ID3).
def chooseBestFeatureToSplit(dataSet):
    """Return the column index of the feature whose split yields the
    highest information gain; -1 if no split improves on zero gain."""
    numFeature = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestInfoGain = 0.0
    bestFeatureIndex = -1
    for i in range(numFeature):
        # BUG FIX: the original read feature[0] for every column,
        # so all features were scored on column 0's values.
        featureSet = set(feature[i] for feature in dataSet)
        newEntropy = 0.0
        for value in featureSet:
            subDataSet = splitDataSet(dataSet, i, value)
            prop = len(subDataSet) / float(len(dataSet))
            # Weighted entropy of the partition induced by feature i.
            newEntropy += prop * calcShannonEnt(subDataSet)
        # BUG FIX: gain is baseEntropy minus the weighted split entropy;
        # the original computed -prop*entropy per value, compared the
        # gain against the *index*, and stored the gain into the index.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeatureIndex = i
    return bestFeatureIndex


# Majority vote over class labels.
def majorityCnt(classList):
    """Return the most frequent class label in `classList`."""
    countList = {}
    for oneData in classList:
        countList[oneData] = countList.get(oneData, 0) + 1
    # BUG FIX: dict.iteritems() is Python 2 only; items() works on both.
    sortedList = sorted(countList.items(), key=operator.itemgetter(1), reverse=True)
    return sortedList[0][0]


# Recursively build the decision tree.
def createTree(dataSet, labels):
    """Build an ID3 decision tree as nested dicts {feature_label: {value: subtree}}.

    NOTE: mutates `labels` (deletes the chosen feature's label) — callers
    that need the list afterwards should pass a copy.
    """
    classList = [oneData[-1] for oneData in dataSet]
    # All rows share one class: leaf node.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left to split on: fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeatureIndex = chooseBestFeatureToSplit(dataSet)
    bestFeatureLabel = labels[bestFeatureIndex]
    myTree = {bestFeatureLabel: {}}
    # BUG FIX: the original collected column 0's values instead of the
    # chosen column's, recursing on wrong branches for any other feature.
    featureSet = set(feature[bestFeatureIndex] for feature in dataSet)
    del(labels[bestFeatureIndex])
    for value in featureSet:
        sublabels = labels[:]  # copy: each branch deletes independently
        myTree[bestFeatureLabel][value] = createTree(
            splitDataSet(dataSet, bestFeatureIndex, value), sublabels)
    return myTree


dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels)
print(myTree)

歡迎使用 {小書匠}(xiaoshujiang)編輯器,您可以通過==設定==裡的修改模板來改變新建文章的內容。