
"Machine Learning in Action" Study Notes 2: Building a Decision Tree

Decision Trees
A decision tree is a tree built from a series of decision rules.
The machine-learning technique that derives such a tree from data is called decision tree learning.

Data form: every step in the decision process is a yes/no test.
Applicable data types: numerical and nominal.
Nominal data is simply discrete data: the variable takes its value from a finite set of possible outcomes.

Information Gain

Information entropy:
Entropy measures how disordered a data set is: the more ordered the data, the lower the entropy.
Information gain:
Information gain is the drop in entropy that a split produces; the larger the gain, the better the split. It is the criterion used to find the best feature for partitioning the data set.
The guiding principle for partitioning a data set: make disordered data more ordered.
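As a quick sanity check, here is the entropy of the five-sample data set defined below (two 'yes' labels, three 'no') computed by hand; this small calculation is my own addition, not from the book:

from math import log

# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971 bits
p_yes, p_no = 2/5, 3/5
entropy = -(p_yes * log(p_yes, 2) + p_no * log(p_no, 2))
print(entropy)  # 0.9709505944546686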

from math import log
import operator

def createDataSet():
    # Five training samples; column 1 = can survive without surfacing,
    # column 2 = has flippers, column 3 = class label (is it a fish: yes/no)
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names (discrete values)
    return dataSet, labels  # the data set and its feature labels

def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the occurrences of each class label (last column)
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt

def splitDataSet(dataSet, axis, value):
    # Return the samples whose column `axis` equals `value`, with that column removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column holds the class labels
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole data set
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of feature i
        uniqueVals = set(featList)  # deduplicated values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy after the split
        infoGain = baseEntropy - newEntropy  # information gain: the reduction in entropy
        if infoGain > bestInfoGain:  # keep the feature with the largest gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # index of the best feature

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # the most common class label

def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # the class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all classes identical: stop splitting
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: return the majority class
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # grow the tree from the best feature
    del(labels[bestFeat])  # this feature label has been consumed
    featValues = [example[bestFeat] for example in dataSet]  # all values of the best feature
    uniqueVals = set(featValues)  # drop duplicates
    for value in uniqueVals:  # recurse to build each branch
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    '''
    inputTree: the decision tree model
    featLabels: the feature names, in column order
    testVec: the input sample to classify
    Returns the predicted class label.
    '''
    firstStr = list(inputTree)[0]  # feature name at the root of the tree
    secondDict = inputTree[firstStr]  # the root's children
    featIndex = featLabels.index(firstStr)  # column index of the root feature
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)  # descend into the subtree
    else:
        classLabel = valueOfFeat  # leaf reached: this is the prediction
    return classLabel

def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
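A quick interactive test of the functions above (my own test run; the printed tree is what the toy data actually produces):

if __name__ == '__main__':
    dataSet, labels = createDataSet()
    print(calcShannonEnt(dataSet))            # 0.9709505944546686
    print(chooseBestFeatureToSplit(dataSet))  # 0, i.e. 'no surfacing'
    myTree = createTree(dataSet, labels[:])   # pass a copy: createTree consumes labels
    print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    print(classify(myTree, labels, [1, 0]))   # 'no'
    print(classify(myTree, labels, [1, 1]))   # 'yes'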

Summary:

1. The difference between extend and append
extend splices the elements of its argument onto the list one by one, while append adds its argument as a single element (so a list ends up nested):

>>> A = ['q', 'w', 'e', 'r']
>>> A.extend(['t', 'y'])
>>> A
['q', 'w', 'e', 'r', 't', 'y']
>>> len(A)
6

>>> B = ['q', 'w', 'e', 'r']
>>> B.append(['t', 'y'])
>>> B
['q', 'w', 'e', 'r', ['t', 'y']]
>>> len(B)
5
Plotting tree nodes with text annotations
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc='0.8')  #style for decision (internal) nodes
leafNode = dict(boxstyle="round4", fc='0.8')   #style for leaf nodes
arrow_args = dict(arrowstyle="<-")   #arrow style for the edges
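
The plotting routines below call getNumLeafs and getTreeDepth, which this excerpt never defines. A minimal recursive sketch that matches the nested-dict tree format used here (my own reconstruction, not copied from the post):

def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # internal node: recurse
            numLeafs += getNumLeafs(secondDict[key])
        else:                                         # leaf node
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth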

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction', va='center',
                            ha='center', bbox=nodeType, arrowprops=arrow_args)   #draw a node with an arrow from its parent

def plotMidText(cntrPt, parentPt, txtString):    #label the midpoint of the edge between parent and child
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)   #the leaf count determines the width of the tree
    depth = getTreeDepth(myTree)  #number of levels in the tree
    firstStr = next(iter(myTree))
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff) #centre of this subtree
    plotMidText(cntrPt, parentPt, nodeTxt) #label the incoming edge with the feature value
    plotNode(firstStr, cntrPt, parentPt, decisionNode) #draw the decision node
    secondDict = myTree[firstStr]  #the child dictionary: keep drawing the subtrees
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD #step down one level
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':  #a dict means an internal node, not a leaf
            plotTree(secondDict[key], cntrPt, str(key))  #recurse to draw the subtree
        else:  #a leaf: draw it and label the incoming edge
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
    
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')  #create the figure
    fig.clf()  #clear it
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)  #hide the x and y axes
    plotTree.totalW = float(getNumLeafs(inTree))  #total number of leaves
    plotTree.totalD = float(getTreeDepth(inTree))  #total depth of the tree
    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0  #initial x/y offsets
    plotTree(inTree, (0.5, 1.0), '')    #draw the tree from the root
    plt.show() #display the result
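
A quick smoke test, assuming createDataSet and createTree from the earlier listing are in scope:

myDat, myLabels = createDataSet()
myTree = createTree(myDat, myLabels[:])  # pass a copy: createTree consumes labels
createPlot(myTree)  # shows the two-level toy tree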
Storing the decision tree with the pickle module
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')  # pickle needs binary mode in Python 3
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')  # read back in binary mode
    return pickle.load(fr)
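
A round-trip example, reusing the myTree built above (the filename is my own arbitrary choice):

storeTree(myTree, 'classifierStorage.txt')
print(grabTree('classifierStorage.txt'))  # prints the same nested dict back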
Example: predicting contact lens type
fr = open('lenses.txt')
lenses = [inst.strip().split('\t') for inst in fr.readlines()]  #one tab-separated sample per line
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']   #feature names
lensesTree = createTree(lenses, lensesLabels)
print(lensesTree)
createPlot(lensesTree)