程式人生 >> 機器學習之--決策樹遞歸算法實現

機器學習之--決策樹遞歸算法實現

標籤: 決策樹, 遞歸, 香農熵, split, classlist

import numpy as np
import math

#產生數據的函數
def createdatabase():
    """Build the toy fish-classification data set.

    Returns:
        dataSet: rows of [no-surfacing?, flippers?, class-label],
                 where the label is 'yes' (is a fish) or 'no'.
        labels:  one single-element list per feature column; callers index
                 labels[axis][0] to get the feature name.
    """
    # Original scraped text lost the string quotes ('yes', 'no', feature
    # names); restored here so the module is valid Python.
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = [['no surfacing'], ['flippers']]
    return dataSet, labels
# Demo: materialize the sample data once at module level and show it.
dataSet, labels = createdatabase()
print('dataSet:', dataSet)  # label string was unquoted in the scraped source
print()

#求數據的香農熵                    熵越大 混合的數據越多
def XN(dataSet):
    """Return the Shannon entropy of the class labels (last column of each row).

    The more mixed the labels, the larger the entropy.
    """
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    total = len(dataSet)
    entropy = 0.0
    for key in counts:
        p = counts[key] / total
        entropy -= p * math.log(p, 2)
    return entropy


def D_split(dataSet, axis, value):
    """Return the rows where column `axis` equals `value`, with that column removed.

    E.g. D_split(dataSet, 0, 1) keeps rows whose first feature is 1 and drops
    that feature, so the result has one column fewer than the input.
    """
    result = []
    for row in dataSet:
        if row[axis] == value:
            result.append(row[:axis] + row[axis + 1:])
    return result


def chooseaxis(dataSet):
    """Return the column index of the best feature to split on.

    "Best" is the feature whose split yields the lowest weighted conditional
    entropy (equivalently, the highest information gain vs. the base entropy).
    """
    datasize = len(dataSet)
    baseXN = XN(dataSet)       # entropy before any split
    bestaxis = 0
    for axis in range(len(dataSet[0]) - 1):  # last column is the class label
        values = set(row[axis] for row in dataSet)  # distinct values in column
        newXN = 0.0
        for value in values:
            partdata = D_split(dataSet, axis, value)
            p = len(partdata) / datasize   # weight of this branch
            newXN += p * XN(partdata)
        if newXN < baseXN:
            baseXN = newXN
            bestaxis = axis
    return bestaxis


def major(classlist):
    """Return the most frequent label in classlist (majority vote)."""
    classcount = {}
    for label in classlist:
        classcount[label] = classcount.get(label, 0) + 1
    # Keys sorted ascending by count; the last one is the majority label.
    return sorted(classcount, key=classcount.get)[-1]


def createtree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Tree shape: {feature_name: {feature_value: subtree_or_label, ...}}.

    NOTE: `labels` is mutated (the chosen feature's entry is deleted), matching
    the original behaviour — pass a copy if the caller needs it intact.
    """
    classlist = [row[-1] for row in dataSet]
    # Stop: all rows share one label -> pure leaf.
    if classlist.count(classlist[0]) == len(dataSet):
        return classlist[0]
    # Stop: no features left (only the label column) -> majority vote.
    if len(dataSet[0]) == 1:
        return major(classlist)
    axis = chooseaxis(dataSet)
    label_choose = labels[axis]
    del labels[axis]
    mytree = {label_choose[0]: {}}
    # Deduplicate with set(): the original iterated raw column values and
    # recomputed identical subtrees for duplicates (same result, wasted work).
    for value in set(row[axis] for row in dataSet):
        # Slice-copy so sibling branches don't see each other's deletions.
        newlabels = labels[:]
        mytree[label_choose[0]][value] = createtree(D_split(dataSet, axis, value), newlabels)
    return mytree


if __name__ == "__main__":
    # Demo run against the module-level sample data.
    print('bestaxis:', chooseaxis(dataSet))          # expected: 0
    classlist = [row[2] for row in dataSet]
    print('classlist:', classlist)
    print("mytree:\n", createtree(dataSet, labels))
    # Expected:
    # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

機器學習之--決策樹遞歸算法實現