1. 程式人生 > >基於決策樹預測隱形眼鏡型別

基於決策樹預測隱形眼鏡型別

隱形眼鏡資料集是著名的資料集,它包含很多患者眼部狀況的觀察條件以及醫生推薦的隱形眼鏡型別。隱形眼鏡的型別包括硬材質、軟材質以及不適合佩戴隱形眼鏡。資料集如下圖所示,第一列代表年齡‘age’,第二列代表醫生的建議‘prescript’,第三列代表是否散光‘astigmatic’,第四列代表戴眼鏡的頻率‘tearRate’。

隱形眼鏡資料集

 1.匯入資料集,將資料集轉換到列表中

fr = open('lenses.txt')
lenses = [line.strip().split('\t') for line in fr.readlines()]
lensesLabels = ['age','prescript','astigmatic','tearRate']
lenses

執行結果:
[['young', 'myope', 'no', 'reduced', 'no lenses'],
 ['young', 'myope', 'no', 'normal', 'soft'],
 ['young', 'myope', 'yes', 'reduced', 'no lenses'],
 ['young', 'myope', 'yes', 'normal', 'hard'],
 ['young', 'hyper', 'no', 'reduced', 'no lenses'],
 ['young', 'hyper', 'no', 'normal', 'soft'],
 ['young', 'hyper', 'yes', 'reduced', 'no lenses'],
 ['young', 'hyper', 'yes', 'normal', 'hard'],
 ['pre', 'myope', 'no', 'reduced', 'no lenses'],
 ['pre', 'myope', 'no', 'normal', 'soft'],
 ['pre', 'myope', 'yes', 'reduced', 'no lenses'],
 ['pre', 'myope', 'yes', 'normal', 'hard'],
 ['pre', 'hyper', 'no', 'reduced', 'no lenses'],
 ['pre', 'hyper', 'no', 'normal', 'soft'],
 ['pre', 'hyper', 'yes', 'reduced', 'no lenses'],
 ['pre', 'hyper', 'yes', 'normal', 'no lenses'],
 ['presbyopic', 'myope', 'no', 'reduced', 'no lenses'],
 ['presbyopic', 'myope', 'no', 'normal', 'no lenses'],
 ['presbyopic', 'myope', 'yes', 'reduced', 'no lenses'],
 ['presbyopic', 'myope', 'yes', 'normal', 'hard'],
 ['presbyopic', 'hyper', 'no', 'reduced', 'no lenses'],
 ['presbyopic', 'hyper', 'no', 'normal', 'soft'],
 ['presbyopic', 'hyper', 'yes', 'reduced', 'no lenses'],
 ['presbyopic', 'hyper', 'yes', 'normal', 'no lenses']]

2.計算原始資料夏農熵

#計算原始資料的夏農熵
import numpy as np
import math
from math import log
def shannonEntropy(dataSet):
    num = len(dataSet)
    classCount = {}
    for a in dataSet:
        label = a[-1]#最後一列為類別標籤
        classCount[label] = classCount.get(label,0)+1
    shangnon = 0.0
    for key in classCount:
        prob = float(classCount[key])/num
        shangnon += -prob*log(prob,2)#夏農熵計算公式
    return shangnon

shannonEntropy(lenses)
執行結果:1.3260875253642983

3.劃分資料集

#劃分資料集
def splitDataSet(dataSet,feature_index,feature_value):
    subDataSet = []
    for b in dataSet:
        if b[feature_index]==feature_value:
            temp = b[:feature_index]#注意這裡不能直接用del刪除而應該用切片,用del原資料集會改變
            temp.extend(b[feature_index+1:])
            subDataSet.append(temp)
    return subDataSet

4.選擇根節點

#選擇根節點
def selectRootNode(dataSet):
    baseEntropy = shannonEntropy(dataSet)#計算原始夏農熵
    numFeatures = len(dataSet[0])-1#特徵個數
    maxInfoGain = 0.0;bestFeature = 0
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqVals = set(featList)
        newEntropy = 0.0
        for j in uniqVals:
            subDataSet = splitDataSet(dataSet,i,j)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * shannonEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy#資訊增益
        if(infoGain>maxInfoGain):
            maxInfoGain = infoGain
            bestFeature = i
    return bestFeature

5.構建樹結構

#選擇根節點
def selectRootNode(dataSet):
    baseEntropy = shannonEntropy(dataSet)#計算原始夏農熵
    numFeatures = len(dataSet[0])-1#特徵個數
    maxInfoGain = 0.0;bestFeature = 0
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqVals = set(featList)
        newEntropy = 0.0
        for j in uniqVals:
            subDataSet = splitDataSet(dataSet,i,j)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * shannonEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy#資訊增益
        if(infoGain>maxInfoGain):
            maxInfoGain = infoGain
            bestFeature = i
    return bestFeature

lensesLabels = ['age', 'prescript', 'astigmatic','tearRate']
myTree = createTree(lenses,lensesLabels)
myTree
執行結果:{'tearRate': {'normal': {'astigmatic': {'no': {'age': {'young': 'soft',
      'pre': 'soft',
      'presbyopic': {'prescript': {'hyper': 'soft', 'myope': 'no lenses'}}}},
    'yes': {'prescript': {'hyper': {'age': {'young': 'hard',
        'pre': 'no lenses',
        'presbyopic': 'no lenses'}},
      'myope': 'hard'}}}},
  'reduced': 'no lenses'}}

6.使用樹結構執行分類

def classifier(myTree,featLabels,testVec):
    firstFeat = list(myTree.keys())[0]
    secondDict = myTree[firstFeat]
    featIndex = featLabels.index(firstFeat)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classifier(secondDict[key],featLabels,testVec)
            else:classLabel = secondDict[key]
    return classLabel

classifier(myTree, ['age','prescript','astigmatic','tearRate'],['young','myope','yes','normal'])
執行結果:'hard'

7.畫樹形圖,這裡用Graphviz和pydotplus畫,資料集需要為數字

#將屬性用數字代表,'young'=0,'pre'=1,'presbyopic=2';'myope=0','hyper=1';'no'=0,'yes'=1;'reduced'=0,'normal'=1
a = np.array([0 if line[0]=='young' else 1 if line[0]=='pre' else 2 for line in lenses])
b = np.array([0 if line[1]=='myope' else 1 for line in lenses])
c = np.array([0 if line[2]=='no' else 1 for line in lenses])
d = np.array([0 if line[3]=='reduced' else 1 for line in lenses])
e = [a,b,c,d]
data = np.array(e).T
data
執行結果:
array([[0, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 1],
       [0, 1, 0, 0],
       [0, 1, 0, 1],
       [0, 1, 1, 0],
       [0, 1, 1, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 1],
       [1, 0, 1, 0],
       [1, 0, 1, 1],
       [1, 1, 0, 0],
       [1, 1, 0, 1],
       [1, 1, 1, 0],
       [1, 1, 1, 1],
       [2, 0, 0, 0],
       [2, 0, 0, 1],
       [2, 0, 1, 0],
       [2, 0, 1, 1],
       [2, 1, 0, 0],
       [2, 1, 0, 1],
       [2, 1, 1, 0],
       [2, 1, 1, 1]])
#畫樹形圖
from sklearn import tree
clf = tree.DecisionTreeClassifier()
target =np.array([line[-1] for line in lenses])
clf = clf.fit(data,target)
import pydotplus
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("lenses.pdf")