python實現西瓜書《機器學習》習題4.4基尼指數決策樹,預剪枝及後剪枝
阿新 • • 發佈:2018-11-11
大神程式碼:https://blog.csdn.net/Snoopy_Yuan/article/details/69223240
昨天畫不出樹有點煩躁,隨便百度了一些資料,還是畫不出來。
今天這道題,其實就是把資訊增益換成基尼指數,本質上的構造樹邏輯是一致的。
不過原始碼有個小錯誤,在上面連結裡已經評論了,好奇寶寶可以自己去看
不過,奇葩的是前後剪枝算出來的準確率一毛一樣,估計程式裡還有問題,以後再扣吧。。。
主程式gini_decision_tree.py
# gini_decision_tree.py -- exercise 4.4: build a Gini-index decision tree,
# compare full / pre-pruned / post-pruned accuracy, then run 5-fold CV.
# Based on https://blog.csdn.net/Snoopy_Yuan/article/details/69223240
import pandas as pd

import decision_tree

# gb18030 would cover Chinese / minority-script characters, but a plain
# text read works for this CSV, so no encoding argument is passed.
with open("/Users/huatong/PycharmProjects/Data/watermelon_33.csv", mode="r") as data_file:
    df = pd.read_csv(data_file)

# Train/test split: the same training rows as the book (p.80).
# iloc selects the listed rows; drop keeps the remaining ones.
index_train = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
df_train = df.iloc[index_train]
df_test = df.drop(index_train)

# Full (unpruned) tree.
root = decision_tree.TreeGenerate(df_train)
# decision_tree.DrawPNG(root, "decision_tree_full.png")  # drawing disabled
print("accuracy of full tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))

# Pre-pruning.
root = decision_tree.PrePurn(df_train, df_test)
# decision_tree.DrawPNG(root, "decision_tree_pre.png")
print("accuracy of pre-purning tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))

# Post-pruning: grow a full tree first, then prune bottom-up.
root = decision_tree.TreeGenerate(df_train)
decision_tree.PostPurn(root, df_test)
# decision_tree.DrawPNG(root, "decision_tree_post.png")
print("accuracy of post-purning tree: %.3f" % decision_tree.PredictAccuracy(root, df_test))

# 5-fold cross-validation with post-pruning.
accuracy_scores = []
n = len(df.index)
k = 5
m = n // k  # fold size
for fold in range(k):
    # BUG FIX: the original reused the loop variable "i" for both the fold
    # index and the row index of the inner accuracy loop, which is
    # confusing and fragile; use distinct names.
    test_rows = list(range(fold * m, fold * m + m))
    df_train = df.drop(test_rows)
    df_test = df.iloc[test_rows]
    root = decision_tree.TreeGenerate(df_train)  # generate the tree
    decision_tree.PostPurn(root, df_test)        # post-pruning
    # The original re-implemented the accuracy loop inline; reuse the
    # module helper, which does exactly the same computation.
    accuracy_scores.append(decision_tree.PredictAccuracy(root, df_test))

# Print the per-fold and average prediction accuracy.
accuracy_sum = 0
print("accuracy: ", end="")
for fold in range(k):
    print("%.3f " % accuracy_scores[fold], end="")
    accuracy_sum += accuracy_scores[fold]
print("\naverage accuracy: %.3f" % (accuracy_sum / k))
decision_tree.py
# decision_tree.py -- Gini-index decision tree with pre- and post-pruning
# (exercise 4.4 of Zhou's "Machine Learning"; adapted from
#  https://blog.csdn.net/Snoopy_Yuan/article/details/69223240)
# Expected data layout: first column is a sample id, last column is the
# class label, the columns in between are candidate attributes.


class Node(object):
    """A decision-tree node.

    attr      -- attribute this node splits on (None for a leaf)
    label     -- majority class of the samples reaching this node
                 (only meaningful for prediction at leaves)
    attr_down -- dict mapping an attribute value (or "<=v" / ">v" for a
                 continuous split) to the child Node
    """

    def __init__(self, attr_init=None, label_init=None, attr_down_init=None):
        self.attr = attr_init
        self.label = label_init
        # BUG FIX: the original used a mutable default argument ({}),
        # which would be shared by every Node created without an explicit
        # dict; build a fresh dict per instance instead.
        self.attr_down = {} if attr_down_init is None else attr_down_init


def TreeGenerate(df):
    """Recursively build a decision tree for data set *df*.

    @param df: dataframe (id column first, label column last)
    @return: root Node of the generated tree
    """
    new_node = Node(None, None, {})
    label_arr = df[df.columns[-1]]  # class-label column
    label_count = NodeLabel(label_arr)
    if label_count:  # class statistics not empty
        # Majority class of the current sample set.
        new_node.label = max(label_count, key=label_count.get)
        # Stop: all samples in one class, no samples, or no candidate
        # attribute left (only id + label columns remain).
        if len(label_count) == 1 or len(label_arr) == 0 or len(df.columns) <= 2:
            return new_node
        # Choose the optimal splitting attribute by Gini index.
        new_node.attr, div_value = OptAttr_Gini(df)
        if div_value == 0:  # categoric attribute
            value_count = ValueCount(df[new_node.attr])
            for value in value_count:
                df_v = df[df[new_node.attr].isin([value])]
                df_v = df_v.drop(new_node.attr, axis=1)
                # BUG FIX: recurse on the frame WITH the used attribute
                # dropped; the original recursed on the undropped frame,
                # so attributes were never consumed.
                new_node.attr_down[value] = TreeGenerate(df_v)
        else:  # continuous attribute: binary split at div_value
            value_l = "<=%.3f" % div_value
            value_r = ">%.3f" % div_value
            df_v_l = df[df[new_node.attr] <= div_value]  # left child
            df_v_r = df[df[new_node.attr] > div_value]   # right child
            # Degenerate split (all samples on one side): keep as a leaf
            # instead of recursing forever on the same sample set.
            if len(df_v_l.index) == 0 or len(df_v_r.index) == 0:
                return new_node
            new_node.attr_down[value_l] = TreeGenerate(df_v_l)
            new_node.attr_down[value_r] = TreeGenerate(df_v_r)
    return new_node


def NodeLabel(label_arr):
    """Count the classes in a label sequence.

    @param label_arr: iterable of class labels
    @return: dict {label: count}
    """
    label_count = {}
    for label in label_arr:
        if label in label_count:
            label_count[label] += 1
        else:
            label_count[label] = 1
    return label_count


def OptAttr_Gini(df):
    """Find the attribute with the smallest Gini index.

    @param df: dataframe (id column first, label column last)
    @return: (opt_attr, div_value); div_value is 0 for a categoric
             attribute and the optimal split point for a continuous one
    """
    gini_index = float('Inf')
    opt_attr = None
    div_value = 0
    for attr_id in df.columns[1:-1]:  # skip the id and label columns
        gini_index_tmp, div_value_tmp = GiniIndex(df, attr_id)
        if gini_index_tmp < gini_index:  # goal: minimise the Gini index
            # BUG FIX: the original assigned to a stray name
            # "gini_index_", so the running minimum never updated and
            # the LAST attribute was always chosen.
            gini_index = gini_index_tmp
            opt_attr = attr_id
            div_value = div_value_tmp
    return opt_attr, div_value


def GiniIndex(df, attr_id):
    """Gini index of attribute *attr_id* over data set *df*.

    @param df: dataframe (label column last)
    @param attr_id: attribute (column) name
    @return: (gini_index, div_value); div_value is 0 for a categoric
             attribute and the chosen split point for a continuous one
    """
    gini_index = 0
    div_value = 0  # split point
    n = len(df[attr_id])  # number of samples
    # BUG FIX: "dtype == (float, int)" is not a valid numeric-dtype test;
    # use the dtype kind character ('i' = int, 'f' = float) instead.
    if df[attr_id].dtype.kind in 'if':  # continuous attribute
        sub_gini = {}  # candidate split point -> weighted Gini value
        # sort_values replaces the removed DataFrame.sort; after sorting
        # the index must be reset so positional slicing works below.
        df = df.sort_values([attr_id], ascending=True)
        df = df.reset_index(drop=True)
        data_arr = df[attr_id]
        label_arr = df[df.columns[-1]]
        for i in range(n - 1):
            # Candidate split points: midpoints of adjacent sorted values.
            div = (data_arr[i] + data_arr[i + 1]) / 2
            # BUG FIX: the right partition must run to the END of the
            # array; the original used label_arr[i+1:-1], silently
            # dropping the last sample from every right partition.
            sub_gini[div] = ((i + 1) * Gini(label_arr[0:i + 1]) / n) \
                          + ((n - i - 1) * Gini(label_arr[i + 1:n]) / n)
        div_value, gini_index = min(sub_gini.items(), key=lambda x: x[1])
    else:  # categoric attribute
        data_arr = df[attr_id]
        label_arr = df[df.columns[-1]]
        value_count = ValueCount(data_arr)
        for key in value_count:
            key_label_arr = label_arr[data_arr == key]
            gini_index += value_count[key] * Gini(key_label_arr) / n
    return gini_index, div_value


def Gini(label_arr):
    """Gini value 1 - sum(p_k^2) of a label sequence (not the Gini index).

    @param label_arr: iterable of class labels
    @return: float Gini value
    """
    gini = 1
    n = len(label_arr)
    label_count = NodeLabel(label_arr)
    for key in label_count:
        gini -= (label_count[key] / n) * (label_count[key] / n)
    return gini


def ValueCount(data_arr):
    """Count the occurrences of each attribute value.

    @param data_arr: iterable of attribute values
    @return: dict {value: count}
    """
    value_count = {}
    for value in data_arr:
        if value in value_count:
            value_count[value] += 1
        else:
            value_count[value] = 1
    return value_count


def Predict(root, df_sample):
    """Predict the class of a single-row dataframe by walking the tree.

    @param root: Node, root of the decision tree
    @param df_sample: dataframe holding exactly one sample row
    @return: predicted class label
    """
    import re  # regex extracts the split value from "<=x" / ">x" keys
    while root.attr is not None:
        if df_sample[root.attr].dtype.kind in 'if':  # continuous attribute
            # Recover the split value from any child-branch key.
            div_value = 0
            for key in list(root.attr_down):
                num = re.findall(r"\d+\.?\d*", key)
                div_value = float(num[0])
                break
            if df_sample[root.attr].values[0] <= div_value:
                key = "<=%.3f" % div_value
                root = root.attr_down[key]
            else:
                key = ">%.3f" % div_value
                root = root.attr_down[key]
        else:  # categoric attribute
            key = df_sample[root.attr].values[0]
            if key in root.attr_down:
                root = root.attr_down[key]
            else:
                # Unseen attribute value: fall back to this node's label.
                break
    return root.label


def PredictAccuracy(root, df_test):
    """Prediction accuracy of the tree on a test set.

    @param root: Node, root of the decision tree
    @param df_test: dataframe, test data set
    @return: float accuracy in [0, 1]; 0 for an empty test set
    """
    if len(df_test.index) == 0:
        return 0
    pred_true = 0
    for i in df_test.index:
        label = Predict(root, df_test[df_test.index == i])
        if label == df_test[df_test.columns[-1]][i]:
            pred_true += 1
    return pred_true / len(df_test.index)


def PrePurn(df_train, df_test):
    """Build a decision tree with pre-pruning.

    A split is only performed when it improves accuracy on *df_test*.

    @param df_train: dataframe, training set
    @param df_test: dataframe, validation set for the pruning decision
    @return: root Node of the (possibly pruned) tree
    """
    new_node = Node(None, None, {})
    label_arr = df_train[df_train.columns[-1]]
    label_count = NodeLabel(label_arr)
    if label_count:  # class statistics not empty
        new_node.label = max(label_count, key=label_count.get)
        # Stop when the node is pure or there is no sample.
        if len(label_count) == 1 or len(label_arr) == 0:
            return new_node
        # Validation accuracy of keeping this node as a leaf.
        a0 = PredictAccuracy(new_node, df_test)
        # Optimal attribute for a tentative split (via Gini index).
        new_node.attr, div_value = OptAttr_Gini(df_train)
        if div_value == 0:  # categoric attribute
            value_count = ValueCount(df_train[new_node.attr])
            for value in value_count:
                df_v = df_train[df_train[new_node.attr].isin([value])]  # sub set
                df_v = df_v.drop(new_node.attr, axis=1)
                new_node_child = Node(None, None, {})
                # BUG FIX: the child label must be the majority class of
                # the SUBSET; the original counted labels over the whole
                # training set, so every child got the global majority
                # (making pre-pruning a no-op).
                label_count_child = NodeLabel(df_v[df_v.columns[-1]])
                new_node_child.label = max(label_count_child,
                                           key=label_count_child.get)
                new_node.attr_down[value] = new_node_child
            # Keep the split only if it improves validation accuracy.
            a1 = PredictAccuracy(new_node, df_test)
            if a1 > a0:  # branch
                for value in value_count:
                    df_v = df_train[df_train[new_node.attr].isin([value])]
                    df_v = df_v.drop(new_node.attr, axis=1)
                    new_node.attr_down[value] = TreeGenerate(df_v)
            else:  # prune: revert to a leaf
                new_node.attr = None
                new_node.attr_down = {}
        else:  # continuous attribute: tentative binary split
            value_l = "<=%.3f" % div_value
            value_r = ">%.3f" % div_value
            df_v_l = df_train[df_train[new_node.attr] <= div_value]
            df_v_r = df_train[df_train[new_node.attr] > div_value]
            new_node_l = Node(None, None, {})
            new_node_r = Node(None, None, {})
            label_count_l = NodeLabel(df_v_l[df_v_l.columns[-1]])
            label_count_r = NodeLabel(df_v_r[df_v_r.columns[-1]])
            new_node_l.label = max(label_count_l, key=label_count_l.get)
            new_node_r.label = max(label_count_r, key=label_count_r.get)
            new_node.attr_down[value_l] = new_node_l
            new_node.attr_down[value_r] = new_node_r
            a1 = PredictAccuracy(new_node, df_test)
            if a1 > a0:  # branch
                new_node.attr_down[value_l] = TreeGenerate(df_v_l)
                new_node.attr_down[value_r] = TreeGenerate(df_v_r)
            else:  # prune: revert to a leaf
                new_node.attr = None
                new_node.attr_down = {}
    return new_node


def PostPurn(root, df_test):
    """Post-prune (bottom-up) the tree rooted at *root*.

    @param root: Node, root of the tree (modified in place when pruned)
    @param df_test: dataframe, validation set for the pruning decision
    @return: the subtree's validation accuracy, or -1 when a deeper node
             was kept unpruned (so no ancestor may be pruned either)
    """
    if root.attr is None:  # leaf node
        return PredictAccuracy(root, df_test)
    # Weighted validation accuracy over the children.
    a1 = 0
    value_count = ValueCount(df_test[root.attr])
    for value in list(value_count):
        df_test_v = df_test[df_test[root.attr].isin([value])]  # sub set
        if value in root.attr_down:  # tree has a branch for this value
            a1_v = PostPurn(root.attr_down[value], df_test_v)
        else:  # unseen value: score against this node's label
            a1_v = PredictAccuracy(root, df_test_v)
        if a1_v == -1:  # a deeper node was kept: stop pruning here
            return -1
        a1 += a1_v * len(df_test_v.index) / len(df_test.index)
    # Validation accuracy if this node were collapsed to a leaf.
    node = Node(None, root.label, {})
    a0 = PredictAccuracy(node, df_test)
    if a0 >= a1:  # pruning does not hurt: collapse in place
        root.attr = None
        root.attr_down = {}
        return a0
    else:
        return -1


def DrawPNG(root, out_file):
    """Render the decision tree rooted at *root* to a PNG file.

    @param root: Node, the root node of the tree
    @param out_file: str, name/path of the output file
    """
    # BUG FIX: the original had a stray top-level "import graphviz" and
    # fell through to a NameError after a failed pydotplus import; bail
    # out cleanly instead.
    try:
        from pydotplus import graphviz
    except ImportError:
        print("module pydotplus.graphviz not found")
        return
    g = graphviz.Dot()  # new dot graph
    TreeToGraph(0, g, root)
    g2 = graphviz.graph_from_dot_data(g.to_string())
    g2.write_png(out_file)


def TreeToGraph(i, g, root):
    """Add the subtree at *root* to dot graph *g*, numbering nodes from *i*.

    @param i: int, number to assign to this node
    @param g: pydotplus.graphviz.Dot() object being built
    @param root: Node, root of the subtree to add
    @return: (last node number used, this node's graphviz id)
    """
    try:
        from pydotplus import graphviz  # both pydotplus and graphviz required
    except ImportError:
        print("module pydotplus.graphviz not found")
        return i, i
    if root.attr is None:
        g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
    else:
        g_node_label = "Node:%d\n好瓜:%s\n屬性:%s" % (i, root.label, root.attr)
    g_node = i
    g.add_node(graphviz.Node(g_node, label=g_node_label))
    for value in list(root.attr_down):
        i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
        g.add_edge(graphviz.Edge(g_node, g_child, label=value))
    return i, g_node