決策樹ID3演算法和C4.5演算法實戰
阿新 • 發佈:2018-12-24
老師給的題目:
程式碼實現【兩種演算法合在一個檔案裡】:
from math import log2


def createDataSet():
    """Build the play/no-play training set (classic weather data).

    Each row is [weather, temperature, humidity, wind speed, class];
    the last element is the class ('yes'/'no').  The last entry of
    `labels` ('activity') names the class column, so only the first
    len(labels) - 1 labels are splittable attributes.
    """
    dataSet = [[1, 1, 1, 0, 'no'],
               [1, 1, 1, 1, 'no'],
               [0, 1, 1, 0, 'yes'],
               [-1, 0, 1, 0, 'yes'],
               [-1, -1, 0, 0, 'yes'],
               [-1, -1, 0, 1, 'no'],
               [0, -1, 0, 1, 'yes'],
               [1, 0, 1, 0, 'no'],
               [1, -1, 0, 0, 'yes'],
               [-1, 0, 0, 0, 'yes'],
               [1, 0, 0, 1, 'yes'],
               [0, 0, 1, 1, 'yes'],
               [0, 1, 0, 0, 'yes'],
               [-1, 0, 1, 1, 'no']]
    labels = ['weather', 'temperature', 'humidity', 'wind speed', 'activity']
    return dataSet, labels


def calcEntropy(dataSet):
    """Shannon entropy of the class column (last element of every row)."""
    totalNum = len(dataSet)
    labelNum = {}
    for data in dataSet:
        labelNum[data[-1]] = labelNum.get(data[-1], 0) + 1
    entropy = 0.0
    for count in labelNum.values():
        p = count / totalNum
        entropy -= p * log2(p)
    return entropy


def calcEntropyForFeature(featureList):
    """Entropy of a list of feature values (C4.5 "split information")."""
    totalNum = len(featureList)
    valueNum = {}
    for value in featureList:
        valueNum[value] = valueNum.get(value, 0) + 1
    entropy = 0.0
    for count in valueNum.values():
        p = count / totalNum
        entropy -= p * log2(p)
    return entropy


def splitDataSetByFeature(i, dataSet):
    """Partition `dataSet` by the value of column `i`.

    Returns {feature_value: sub_dataset}; column `i` itself is removed
    from every row of the sub-datasets.
    """
    subSet = {}
    for row in dataSet:
        subSet.setdefault(row[i], []).append(row[:i] + row[i + 1:])
    return subSet


def chooseBestFeatureID3(dataSet, labels):
    """Return the column index with the largest information gain (ID3).

    Bug fix: iterate only over the len(labels) - 1 attribute columns.
    The last label names the class itself; splitting on it would
    trivially yield maximal gain and build a degenerate tree whose
    classification then indexes past the end of a 4-value test vector.
    """
    bestFeature = 0
    initialEntropy = calcEntropy(dataSet)
    biggestEntropyG = 0
    totalN = len(dataSet)
    for i in range(len(labels) - 1):  # exclude the class column
        currentEntropy = 0
        subSet = splitDataSetByFeature(i, dataSet)
        for branch in subSet.values():
            currentEntropy += len(branch) / totalN * calcEntropy(branch)
        entropyGain = initialEntropy - currentEntropy
        if biggestEntropyG < entropyGain:
            biggestEntropyG = entropyGain
            bestFeature = i
    return bestFeature


def chooseBestFeatureC45(dataSet, labels):
    """Return the column index with the largest gain ratio (C4.5).

    Same class-column fix as chooseBestFeatureID3; additionally skips
    attributes whose split information is 0 (a single-valued column),
    which would otherwise raise ZeroDivisionError.
    """
    bestFeature = 0
    initialEntropy = calcEntropy(dataSet)
    biggestEntropyGR = 0
    totalN = len(dataSet)
    for i in range(len(labels) - 1):  # exclude the class column
        feature = [data[i] for data in dataSet]
        entropyFeature = calcEntropyForFeature(feature)
        if entropyFeature == 0:  # constant column: nothing to split, avoid /0
            continue
        currentEntropy = 0
        subSet = splitDataSetByFeature(i, dataSet)
        for branch in subSet.values():
            currentEntropy += len(branch) / totalN * calcEntropy(branch)
        entropyGainRatio = (initialEntropy - currentEntropy) / entropyFeature
        if biggestEntropyGR < entropyGainRatio:
            biggestEntropyGR = entropyGainRatio
            bestFeature = i
    return bestFeature


def checkIsOneCateg(newDataSet):
    """True when every row of `newDataSet` carries the same class."""
    return len({data[-1] for data in newDataSet}) == 1


def majorityCateg(newDataSet):
    """Most frequent class label in `newDataSet`."""
    categCount = {}
    for data in newDataSet:
        categCount[data[-1]] = categCount.get(data[-1], 0) + 1
    return max(categCount.items(), key=lambda kv: kv[1])[0]


def _createDecisionTree(decisionTree, dataSet, tmplabels, chooseBest):
    """Shared recursive builder; `chooseBest` selects the split column.

    Grows the nested-dict tree in place: {attribute: {value: subtree|class}}.
    `tmplabels` is copied so the caller's label list is never mutated.
    """
    labels = list(tmplabels)
    bestFeature = chooseBest(dataSet, labels)
    currentLabel = labels[bestFeature]
    decisionTree[currentLabel] = {}
    subSet = splitDataSetByFeature(bestFeature, dataSet)
    del labels[bestFeature]
    for key, newDataSet in subSet.items():
        if checkIsOneCateg(newDataSet):
            # Pure branch: store the class as a leaf.
            decisionTree[currentLabel][key] = newDataSet[0][-1]
        elif len(newDataSet[0]) == 1:
            # No attributes left to split on: fall back to majority vote.
            decisionTree[currentLabel][key] = majorityCateg(newDataSet)
        else:
            decisionTree[currentLabel][key] = {}
            _createDecisionTree(decisionTree[currentLabel][key],
                                newDataSet, labels, chooseBest)


def createDecisionTreeID3(decisionTree, dataSet, tmplabels):
    """Grow an ID3 (information-gain) tree in place into `decisionTree`."""
    _createDecisionTree(decisionTree, dataSet, tmplabels, chooseBestFeatureID3)


def createDecisionTreeC45(decisionTree, dataSet, tmplabels):
    """Grow a C4.5 (gain-ratio) tree in place into `decisionTree`."""
    _createDecisionTree(decisionTree, dataSet, tmplabels, chooseBestFeatureC45)


def classify(inputTree, featLabels, testVec):
    """Classify `testVec` (feature values ordered as in `featLabels`).

    Walks the nested-dict tree and returns the predicted class, or
    None when the tree has no branch for one of the test values
    (bug fix: `classLabel` was previously unbound in that case).
    """
    firstStr = next(iter(inputTree))        # attribute tested at this node
    secondDict = inputTree[firstStr]        # value -> subtree or leaf class
    featIndex = featLabels.index(firstStr)  # position of that attribute in testVec
    classLabel = None
    for key, child in secondDict.items():
        if testVec[featIndex] == key:
            if isinstance(child, dict):     # internal node: keep descending
                classLabel = classify(child, featLabels, testVec)
            else:                           # leaf: predicted class
                classLabel = child
    return classLabel


if __name__ == '__main__':
    dataSetID3, labelsID3 = createDataSet()
    decisionTreeID3 = {}
    createDecisionTreeID3(decisionTreeID3, dataSetID3, labelsID3)
    print("ID3 decision tree: ", decisionTreeID3)
    for tmp in dataSetID3:
        category = classify(decisionTreeID3, labelsID3, tmp[0:4])
        print(tmp[0:4], ", classified as by ID3: ", category)

    dataSetC45, labelsC45 = createDataSet()
    decisionTreeC45 = {}
    createDecisionTreeC45(decisionTreeC45, dataSetC45, labelsC45)
    print("C4.5 decision tree: ", decisionTreeC45)
    for tmp in dataSetC45:
        category = classify(decisionTreeC45, labelsC45, tmp[0:4])
        print(tmp[0:4], ", classified as by C4.5: ", category)