1. 程式人生 > >小白向Apriori演算法Python實現

小白向Apriori演算法Python實現

  參考部落格:http://www.cnblogs.com/llhthinker/p/6719779.html

  

  以下是參考別人的程式碼、用Python實現的Apriori演算法;演算法介紹見 https://www.cnblogs.com/1113127139aaa/p/9926507.html

  內容是實現Apriori演算法的完整流程,資料是簡單的測試陣列。因為自己比較菜,註釋寫得很淺,主要是為了自己複習;如果有像我一樣的小白,也可以參考。這裡先把已完成的部分貼上來,原部落格中有原博主的註釋。

  

def load_data_set():
    """Load a small sample transaction data set.

    Returns:
        A list of transactions (shopping baskets); each transaction is a
        list of item names.
    """
    data_set = [
        ['l1', 'l2', 'l5'],
        ['l2', 'l4'],
        ['l2', 'l3'],
        ['l1', 'l2', 'l4'],
        ['l1', 'l3'],
        ['l2', 'l3'],
        ['l1', 'l3'],
        ['l1', 'l2', 'l3', 'l5'],
        ['l1', 'l2', 'l3'],
    ]
    return data_set


def create_C1(data_set):
    """Scan the data set and build C1, the candidate 1-itemsets.

    Each distinct item becomes a one-element frozenset. frozenset is used
    because itemsets are later used as dictionary keys (in support_data)
    and as members of other sets, both of which require hashable values.

    Args:
        data_set: An iterable of transactions, each an iterable of items.

    Returns:
        A set of one-element frozensets, one per distinct item.
    """
    return {frozenset([item]) for t in data_set for item in t}


def is_apriori(Ck_item, Lksub1):
    """Check the Apriori (downward-closure) property for one candidate.

    A candidate k-itemset can only be frequent if every subset obtained by
    removing a single item is itself a member of Lk-1.

    Args:
        Ck_item: The candidate itemset (a frozenset).
        Lksub1: Lk-1, the set of frequent (k-1)-itemsets.

    Returns:
        True if every one-item-removed subset of Ck_item is in Lksub1
        (the candidate is kept), False otherwise (the candidate is pruned).
    """
    return all(Ck_item - frozenset([item]) in Lksub1 for item in Ck_item)


def create_Ck(Lksub1, k):
    """Generate the candidate k-itemsets Ck from Lk-1.

    Two itemsets of Lk-1 are joined (set union) when their sorted item
    lists agree on the first k-2 items. The joined itemset is kept only if
    it passes the is_apriori pruning check.

    Args:
        Lksub1: Lk-1, the set of frequent (k-1)-itemsets.
        k: The size of the itemsets to generate.

    Returns:
        A set of frozensets: the candidate k-itemsets.
    """
    Ck = set()
    # Sort every itemset once up front rather than on every comparison.
    entries = [(itemset, sorted(itemset)[0:k - 2]) for itemset in Lksub1]
    for i, (set_a, prefix_a) in enumerate(entries):
        # Union is symmetric, so each unordered pair is visited only once.
        for set_b, prefix_b in entries[i + 1:]:
            if prefix_a == prefix_b:
                Ck_item = set_a | set_b  # join step: union of the pair
                if is_apriori(Ck_item, Lksub1):  # prune step
                    Ck.add(Ck_item)
    return Ck


def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """Filter the candidates Ck down to the frequent k-itemsets Lk.

    Counts in how many transactions each candidate occurs, computes its
    support (count / number of transactions) and keeps the candidates whose
    support is at least min_support.

    Args:
        data_set: A sequence of transactions, each an iterable of items.
        Ck: The set of candidate itemsets (frozensets).
        min_support: Minimum support threshold, as a fraction.
        support_data: Dictionary updated in place; each frequent itemset
            is recorded with its support.

    Returns:
        Lk, the set of candidates that meet min_support.
    """
    Lk = set()
    item_count = {}
    for t in data_set:
        # Build the transaction's set once so each subset test is cheap.
        basket = set(t)
        for item in Ck:
            if item.issubset(basket):
                item_count[item] = item_count.get(item, 0) + 1
    t_num = float(len(data_set))  # total number of transactions
    for item, count in item_count.items():
        support = count / t_num
        if support >= min_support:
            Lk.add(item)
            support_data[item] = support
    return Lk


def generate_L(data_set, k, min_support):
    """Generate the frequent itemsets L1 .. Lk.

    Starts from C1 and runs k rounds; the Lk produced by each round is
    appended to L and the supports are recorded in support_data. A level
    with no frequent itemsets is still appended (as an empty set), so the
    returned list always has one entry per level from 1 to k.

    Args:
        data_set: A sequence of transactions, each an iterable of items.
        k: The maximum itemset size to generate.
        min_support: Minimum support threshold, as a fraction.

    Returns:
        A tuple (L, support_data): L is the list of frequent-itemset sets,
        L[i] holding the (i+1)-itemsets; support_data maps each frequent
        itemset to its support.
    """
    support_data = {}
    C1 = create_C1(data_set)
    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
    Lksub1 = L1.copy()
    L = [Lksub1]
    for i in range(2, k + 1):
        Ci = create_Ck(Lksub1, i)  # candidates of size i from Li-1
        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
        Lksub1 = Li.copy()
        L.append(Lksub1)
    return L, support_data


def generate_big_rules(L, support_data, min_conf):
    """Generate association rules ("big rules") from frequent itemsets.

    For every frequent itemset and every previously visited frequent
    itemset that is a subset of it, the rule
    (freq_set - sub_set) => sub_set is formed with confidence
    support(freq_set) / support(freq_set - sub_set).

    Args:
        L: The list of Lk, ordered by increasing itemset size.
        support_data: Dictionary mapping frequent itemsets to support.
        min_conf: Minimum confidence threshold.

    Returns:
        A list of rules whose confidence is at least min_conf. Each rule
        is a 3-tuple (antecedent, consequent, confidence).
    """
    big_rule_list = []
    seen_rules = set()  # constant-time duplicate check; list keeps order
    sub_set_list = []
    for Lk in L:
        for freq_set in Lk:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    antecedent = freq_set - sub_set
                    conf = support_data[freq_set] / support_data[antecedent]
                    big_rule = (antecedent, sub_set, conf)
                    if conf >= min_conf and big_rule not in seen_rules:
                        seen_rules.add(big_rule)
                        big_rule_list.append(big_rule)
            sub_set_list.append(freq_set)
    return big_rule_list


if __name__ == "__main__":
    # Demo run on the sample data set.
    data_set = load_data_set()
    # Itemsets up to size 3, with a minimum support of 0.2.
    L, support_data = generate_L(data_set, k=3, min_support=0.2)
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)
    for Lk in L:
        if not Lk:
            # Nothing frequent at this size; there is no itemset to measure.
            continue
        print("=" * 50)
        print("frequent " + str(len(next(iter(Lk)))) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
    print()
    print("Big Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])