1. 程式人生 > >簡單優化的Apriori演算法進行關聯規則分析

簡單優化的Apriori演算法進行關聯規則分析

實驗時發現,演算法的主要耗時來自 C1 -> L1 的構建,因此對 L1 的產生方式進行了優化,在此簡單記錄。

def loadDataSet(filename):
    """Load one transaction per line from a text file.

    For every line, the text after the first ``{`` is taken, its last two
    characters are dropped, and the remainder is split on commas. The two
    dropped characters are presumably a closing ``}"`` as in a line such as
    ``"1","{milk,bread}"`` -- confirm against the actual data file.

    Args:
        filename: Path of the file to read.

    Returns:
        A list containing one list of item strings per line.

    Raises:
        IndexError: If a line contains no ``{``.
    """
    dataSet = []
    # The context manager closes the handle even when a line fails to parse.
    with open(filename, 'r') as f:
        for line in f:
            # Strip the line terminator before trimming so that a final line
            # without a trailing newline is cut at the same place as the rest.
            items = line.split('{')[1].rstrip('\n')[:-2].split(',')
            dataSet.append(items)
    return dataSet

def createL1(dataSet, minsupport=0.5):
    """Build the frequent 1-itemsets with a single counting pass.

    Args:
        dataSet: Iterable-of-iterables of hashable items (one per transaction).
            It must support ``len()``.
        minsupport: Minimum fraction of transactions an item must appear in.

    Returns:
        A tuple ``(L1, supportData)``. ``L1`` is a list of one-element
        frozensets for the items meeting ``minsupport``; ``supportData`` maps
        the one-element frozenset of every seen item to its support.
    """
    counts = {}
    for transaction in dataSet:
        # Count an item once per transaction (dict.fromkeys de-duplicates
        # while keeping first-seen order) so the support agrees with scanD,
        # which tests subset membership and therefore ignores repeats.
        for item in dict.fromkeys(transaction):
            counts[item] = counts.get(item, 0) + 1

    l = float(len(dataSet))
    print('樣本數量為' + str(l))
    threshold = l * minsupport

    supportData = {}
    frequent = []
    for item, count in counts.items():
        single = frozenset([item])
        supportData[single] = count / l
        if count >= threshold:
            frequent.append(single)
    return frequent, supportData


def scanD(D, Ck, minsupport):
    """Count candidate itemsets over the transactions and keep frequent ones.

    Args:
        D: Sequence of transactions, each a set of items.
        Ck: Iterable of candidate itemsets (frozensets).
        minsupport: Minimum support a candidate needs to be kept.

    Returns:
        A tuple ``(retList, supportData)``. ``retList`` holds the candidates
        whose support is at least ``minsupport``; ``supportData`` maps every
        candidate that occurred in at least one transaction to its support.
    """
    # A repeated candidate would be counted once per repetition and end up
    # with an inflated support, so collapse repeats (order preserved).
    candidates = list(dict.fromkeys(Ck))

    ssCnt = {}
    for tid in D:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    print('統計中')

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minsupport:
            retList.append(key)
        supportData[key] = support
    # Appending and reversing once yields the same order as inserting every
    # element at the front, without the per-insert list shift.
    retList.reverse()
    return retList, supportData


def aprioriGen(Lk, k):
    """Join (k-1)-itemsets that share their first k-2 items into k-itemsets.

    Args:
        Lk: Sequence of frozensets, all of size k-1, with mutually
            comparable items.
        k: Size of the itemsets to generate.

    Returns:
        A list of frozensets, each the union of two members of ``Lk`` whose
        sorted first ``k-2`` items are equal.
    """
    # Sort BEFORE slicing: iteration order of a set is arbitrary, so the
    # first k-2 elements of an unsorted listing are not a stable prefix and
    # joining on them can both miss candidates and emit duplicates.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    for i, left_prefix in enumerate(prefixes):
        for j in range(i + 1, len(Lk)):
            if left_prefix == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList


def apriori(dataSet, minSupport):
    """Find all frequent itemsets of ``dataSet`` level by level.

    Args:
        dataSet: Sequence of transactions (iterables of hashable items).
        minSupport: Minimum support for an itemset to count as frequent.

    Returns:
        A tuple ``(L, supportData)``. ``L[i]`` is the list of frequent
        itemsets of size ``i + 1``; the last entry of ``L`` is always an
        empty list. ``supportData`` maps counted itemsets to their support.
    """
    D = list(map(set, dataSet))
    L1, supportData = createL1(dataSet, minSupport)
    print('L1,SupportData1 created')

    L = [L1]
    k = 2
    # Stop as soon as a level yields no frequent itemsets.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supk = scanD(D, Ck, minSupport)
        supportData.update(supk)
        L.append(Lk)
        k += 1
    return L, supportData


def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: List of frequent-itemset levels as returned by ``apriori``.
        supportData: Mapping from itemset to support.
        minConf: Minimum confidence for a rule to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # Level 0 holds single items, which cannot form a rule.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList


def calcConf(freqSet, H, supportData, brl, minconf=0.7):
    """Evaluate the rules ``freqSet - conseq -> conseq`` for each conseq in H.

    Every rule reaching ``minconf`` is printed and appended to ``brl``.

    Args:
        freqSet: The frequent itemset the rules are built from.
        H: Iterable of candidate consequents (frozensets).
        supportData: Mapping from itemset to support.
        brl: List that receives ``(antecedent, consequent, confidence)``.
        minconf: Minimum confidence for a rule to be kept.

    Returns:
        The list of consequents whose rule reached ``minconf``.

    Raises:
        KeyError: If a needed itemset is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minconf:
            print(str(antecedent) + '--->' + str(conseq) + ' 可信度為:' + str(conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate consequents of growing size for one frequent itemset.

    The consequents in ``H`` are tested first; the surviving ones are merged
    into consequents one item larger and tested recursively.

    Args:
        freqSet: The frequent itemset the rules are built from.
        H: List of same-sized candidate consequents (frozensets).
        supportData: Mapping from itemset to support.
        brl: List that receives ``(antecedent, consequent, confidence)``.
        minConf: Minimum confidence for a rule to be kept.
    """
    if not H:
        return
    m = len(H[0])
    # A consequent of size m leaves a non-empty antecedent only while the
    # itemset has more than m items; comparing against m + 1 instead would
    # skip the largest consequents (e.g. a -> {b, c} for a 3-itemset).
    if len(freqSet) > m:
        kept = calcConf(freqSet, H, supportData, brl, minConf)
        Hmp1 = aprioriGen(kept, m + 1)
        # Even a single merged consequent still has to be evaluated.
        if Hmp1:
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


if __name__ == '__main__':
    # To mine the Groceries data instead, use:
    #     dataSet = loadDataSet('Groceries.csv')
    with open('kosarak.dat') as f:
        dataSet = [line.split() for line in f]
    L, supportData = apriori(dataSet, 0.05)
    rules = generateRules(L, supportData, 0.9)