程式人生 > MachineLearning—CART分類迴歸樹python應用實現

MachineLearning—CART分類迴歸樹python應用實現

# -*- coding: utf-8 -*-  

from numpy import *  
import numpy as np  
import pandas as pd  
from math import log  
import operator  
import re 
from collections import defaultdict
import itertools

def calGini(dataSet):
    """Return the Gini impurity of dataSet (class label in the last column).

    Gini = 1 - sum_k p_k^2, where p_k is the fraction of samples in class k.
    """
    total = len(dataSet)
    counts = defaultdict(int)
    for sample in dataSet:
        counts[sample[-1]] += 1
    impurity = 1
    for cnt in counts.values():
        frac = float(cnt) / total
        impurity -= frac * frac
    return impurity

# 傳入的是一個特徵值的列表,返回特徵值二分的結果
def featuresplit(features):
    """Return every binary partition of a list of discrete feature values.

    For n >= 2 values, returns a list of (left, right) tuple pairs where
    left and right are complementary subsets, e.g. for ['a','b','c']:
    [(('a',), ('b','c')), (('b',), ('a','c')), (('c',), ('a','b'))].
    For a single value, returns the degenerate tuple (features,).

    Fixes vs. the original (all backward-compatible with Python 2):
    - `range(count).pop(0)` fails on Python 3 (range has no pop); use
      range(1, count) directly.
    - `print` statement -> function-call form (valid in both 2 and 3).
    - `combiLen/2` is float division on Python 3; use `//`.
    - `zip(...)` is lazy on Python 3; materialize with list().
    """
    count = len(features)
    if count < 2:
        # Only one feature value: nothing to split on.
        print("please check sample's features,only one feature value")
        return (features,)

    # Collect all combinations of size 1 .. count-1; each combination is
    # one candidate "left" side of a binary split.
    combinationsList = []
    for size in range(1, count):
        combinationsList.extend(itertools.combinations(features, size))
    combiLen = len(combinationsList)  # always 2**count - 2, i.e. even

    # combinations() emits subsets in a symmetric order, so pairing the
    # first half with the reversed second half yields complementary pairs.
    resList = list(zip(combinationsList[0:combiLen // 2],
                       combinationsList[combiLen - 1:combiLen // 2 - 1:-1]))
    return resList

#def splitDataSet(dataSet, axis, values):
#    retDataSet = []
#    for featVec in dataSet:
#        for value in values:
#            if featVec[axis] == value:
#                reducedFeatVec = featVec[:axis]     #剔除樣本集
#                reducedFeatVec.extend(featVec[axis+1:])
#                retDataSet.append(reducedFeatVec)
#    return retDataSet   #把那些特徵值等於value的都剔出來

#def splitDataSet(dataSet, axis, values):     #實現了一些特徵的重複利用 比如cover   特徵複用
#    retDataSet = []
#    if len(values) < 2:
#        for featVec in dataSet:        #長度小於2即只有一個特徵值
#            if featVec[axis] == values[0]:   #如果特徵值只有一個,不抽取當選特徵
#                reducedFeatVec = featVec[:axis]     
#                reducedFeatVec.extend(featVec[axis+1:])
#                retDataSet.append(reducedFeatVec)
#    else:
#        for featVec in dataSet:
#            for value in values:
#                if featVec[axis] == value:   #如果特徵值多於一個,選取當前特徵
#                    retDataSet.append(featVec)
#
#    return retDataSet

#處理連續特徵值
# Partition on a continuous feature value.
def splitDataSet(dataSet, axis, value, threshold):
    """Return the subset of dataSet on one side of a continuous split.

    threshold == 'lt' keeps samples with feature `axis` <= value;
    any other threshold keeps samples with feature `axis` > value.
    """
    if threshold == 'lt':
        return [sample for sample in dataSet if sample[axis] <= value]
    return [sample for sample in dataSet if sample[axis] > value]


# 返回最好的特徵以及二分特徵值
"""def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1      #
    bestGiniGain = 1.0
    bestFeature = -1
    bestBinarySplit=()
    for i in range(numFeatures):        #遍歷特徵
        featList = [example[i] for example in dataSet]   #得到特徵列
        uniqueVals = list(set(featList))       #去除重複值的特徵列
        # 三個特徵值的二分結果:
        #   [(('young',), ('old', 'middle')), (('old',), ('young', 'middle')), (('middle',), ('young', 'old'))]
        for split in featuresplit(uniqueVals):   #featuresplit返回特徵的所有二分情況
            GiniGain = 0.0
            if len(split)==1:         #split是一個元組 特徵值只有一個比如:cold_blood 只有一個特徵值就沒辦法繼續劃分下去了 所以跳出迴圈繼續下一迴圈
                continue
            (left,right)=split
            
            # 對於每一個可能的二分結果計算gini增益
            # 左增益
            left_subDataSet = splitDataSet(dataSet, i, left)
            left_prob = len(left_subDataSet)/float(len(dataSet))
            GiniGain += left_prob * calGini(left_subDataSet)
            # 右增益
            right_subDataSet = splitDataSet(dataSet, i, right)
            right_prob = len(right_subDataSet)/float(len(dataSet))
            GiniGain += right_prob * calGini(right_subDataSet)
            if (GiniGain <= bestGiniGain):       #比較是否是最好的結果
                bestGiniGain = GiniGain         #記錄最好的結果和最好的特徵
                bestFeature = i
                bestBinarySplit=(left,right)
    return bestFeature,bestBinarySplit  
"""

#處理連續特徵值
# Continuous features: pick the (feature, threshold) with lowest weighted Gini.
def chooseBestFeatureToSplit(dataSet):
    """Return (bestFeature, bestValue): the column index and threshold whose
    binary split <=value / >value minimizes the weighted Gini impurity.

    Fix: the original initialized a misspelled `bsetValue` and then returned
    `bestValue`, which raised NameError whenever the loop body never ran
    (e.g. a dataset with no feature columns). `bestValue` is now initialized.
    """
    numFeatures = len(dataSet[0]) - 1
    totalCount = float(len(dataSet))  # hoisted loop invariant
    bestGiniGain = 1.0
    bestFeature = -1
    bestValue = ""
    for i in range(numFeatures):                       # each feature column
        featList = [example[i] for example in dataSet]
        uniqueVals = list(set(featList))               # candidate thresholds
        uniqueVals.sort()
        for value in uniqueVals:
            GiniGain = 0.0
            # Left side: samples with feature <= value.
            left_subDataSet = splitDataSet(dataSet, i, value, 'lt')
            GiniGain += len(left_subDataSet) / totalCount * calGini(left_subDataSet)
            # Right side: samples with feature > value.
            right_subDataSet = splitDataSet(dataSet, i, value, 'gt')
            GiniGain += len(right_subDataSet) / totalCount * calGini(right_subDataSet)
            if GiniGain < bestGiniGain:                # keep the best split so far
                bestGiniGain = GiniGain
                bestFeature = i
                bestValue = value
    return bestFeature, bestValue


def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote).

    Fix: `dict.iteritems()` is Python-2-only; `.items()` behaves the same
    on Python 2 (list) and Python 3 (view) for sorting purposes.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Sort (label, count) pairs by count, descending; take the top label.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


"""def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]
#    9/0
    # print dataSet
    if classList.count(classList[0]) == len(classList): 
        return classList[0]       #所有的類別都一樣,就不用再劃分了
    if len(dataSet) == 1:         #如果沒有繼續可以劃分的特徵,就多數表決決定分支的類別
        # print "here"
        return majorityCnt(classList)
    bestFeat,bestBinarySplit = chooseBestFeatureToSplit(dataSet)
#    9/0
    # print bestFeat,bestBinarySplit,labels
    bestFeatLabel = labels[bestFeat]
    if bestFeat==-1:
        return majorityCnt(classList)
    myTree = {bestFeatLabel:{}}
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = list(set(featValues))
#    9/0
    for value in bestBinarySplit:
#        9/0
        subLabels = labels[:]       #拷貝防止其他地方修改  特徵標籤
        if len(value)<2:
            del(subLabels[bestFeat])
#        9/0
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
#        9/0
    return myTree 
"""

#處理連續特徵值, labels是特徵標籤
# Continuous-feature tree builder; labels holds the feature names.
def createTree(dataSet, labels):
    """Recursively grow a CART tree over continuous features.

    dataSet: rows of feature values, class label in the last column.
    labels:  feature names parallel to the feature columns.
    Returns a class label (leaf) or a nested dict of the form
    {feature: {"feature<=v": leftSubtree, "feature>v": rightSubtree}}.
    """
    classList = [sample[-1] for sample in dataSet]

    # Pure node: every remaining sample shares one class -- stop splitting.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # A single sample left: decide the leaf by majority vote.
    if len(dataSet) == 1:
        return majorityCnt(classList)

    bestFeat, bestValue = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    if bestFeat == -1:
        # No usable split was found.
        return majorityCnt(classList)

    thresholdTxt = str(round(float(bestValue), 3))
    subLabels = labels[:]  # copy so recursion cannot mutate the caller's list

    node = {bestFeatLabel: {}}
    node[bestFeatLabel][bestFeatLabel + '<=' + thresholdTxt] = \
        createTree(splitDataSet(dataSet, bestFeat, bestValue, 'lt'), subLabels)
    node[bestFeatLabel][bestFeatLabel + '>' + thresholdTxt] = \
        createTree(splitDataSet(dataSet, bestFeat, bestValue, 'gt'), subLabels)
    return node
#完美沒有問題!!!


####測試分類
#由於在Tree中,連續值特徵的名稱改為了feature<=value的形式  
#因此對於這類特徵,需要利用正則表示式進行分割,獲得特徵名以及分割閾值(其他方法也可以)  
def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking a tree built by createTree.

    Branch names have the form "feature<=value" / "feature>value", so the
    feature name and threshold are recovered with regular expressions.

    Fix: the original indexed `inputTree.values()[0]` and called
    `.reverse()` on `.keys()`, both Python-2-only (dict views are not
    indexable on Python 3); wrapping in list() works on both versions.
    """
    branches = list(inputTree.values())[0]      # {branch-name: subtree-or-label}
    branchKeys = list(branches.keys())
    # Ensure the "<=" branch is first so index 0 is the left branch.
    if '<=' not in branchKeys[0]:
        branchKeys.reverse()

    # e.g. "money<=97.0" -> featvalue 97.0, featkey "money"
    featvalue = float(re.compile("(<=.+)").search(branchKeys[0]).group()[2:])
    featkey = re.compile("(.+<=)").search(branchKeys[0]).group()[:-2]
    featIndex = featLabels.index(featkey)       # position of the feature in the sample

    chosenKey = branchKeys[0] if testVec[featIndex] <= featvalue else branchKeys[1]
    subTree = branches[chosenKey]
    if isinstance(subTree, dict):
        return classify(subTree, featLabels, testVec)   # internal node: recurse
    return subTree                                      # leaf: class label
  
  
def testing(myTree, data_test, labels):
    """Print the accuracy of myTree on data_test (true label in last column).

    Fixes: Python-2-only `print` statement replaced with the function-call
    form (valid in both 2 and 3); an empty test set no longer raises
    ZeroDivisionError.
    """
    if not data_test:
        print('myTree: empty test set')
        return None
    error = 0.0
    for sample in data_test:
        # Count a miss whenever the prediction differs from the truth.
        if classify(myTree, labels, sample) != sample[-1]:
            error += 1
    # %f keeps the fractional accuracy (an earlier %d would truncate it).
    print('myTree %f' % ((len(data_test) - error) / len(data_test)))
    return None



# ---- Driver: load the dataset, grow the tree, evaluate, and plot ----
# NOTE(review): hard-coded Windows path -- adjust to the local dataset location.
df=pd.read_csv('C:/Users/test_5.csv')  
data=df.values[:280,1:].tolist()        # rows 0-279, columns from 1 on: training set (features + class label in last column)

data_full=data[:]  
data_test=df.values[280:,1:].tolist()     # remaining rows: held-out test set
#test_length=len(data_test)
labels=df.columns.values[1:-1].tolist()  # feature names (column 0 skipped, last column is the class label)
labels_full=labels[:]  # copy kept because createTree receives (and may reuse) `labels`
myTree=createTree(data,labels)  

testing(myTree,data_test,labels_full)  
  
import treePlotter  # project-local plotting module -- not part of this file
treePlotter.createPlot(myTree)