1. 程式人生 > 關聯規則(Association Rules)Python 實現

關聯規則(Association Rules)python實現

前言

試著用python實現關聯規則(Apriori演算法),在生成關聯規則的時候遇到問題,不知道怎麼遍歷frequent itemsets中的所有關聯規則。轉專業的一隻小菜雞,初學程式碼,寫的很簡陋,希望各位大牛能指出不足之處。

程式碼

輸入是num個隨機長度、隨機字母組合的列表。

通過字典輸出Frequent itemsets,字典的鍵是itemset,值是出現的次數。

import random
import numpy as np

class Association_rules:
	"""Mine frequent itemsets from a list of transactions (Apriori-style).

	Call ``fit(dataset)`` first; the result is then available in ``count``
	or through ``frequentItemsets()``. Generating the association rules
	themselves is not implemented: ``minConfidence`` and
	``associationRules`` are stored but never used by the methods below.
	"""

	def __init__(self,minSupport=0.2,minConfidence=0.5):
		'''
		minSupport: minimum support, as a fraction of the number of transactions
		minConfidence: minimum confidence (stored only, not used yet)
		dataset: the transactions passed to fit()
		count: list of dicts, one per itemset size; count[0] maps a single
			item to its support count, count[k] (k >= 1) maps a tuple of
			k+1 items to its support count
		associationRules: placeholder for rules satisfying minConfidence
		num: number of transactions in the fitted dataset
		threshold = num*minSupport: minimum support count derived from num
		'''
		self.minSupport = minSupport
		self.minConfidence = minConfidence
		self.dataset = None
		self.count = None
		self.associationRules = None
		self.num = 0
		self.threshold = 0

	@staticmethod
	def _as_tuple(key):
		"""Return key as a tuple of items; a bare item becomes a 1-tuple."""
		# Level-0 keys are the raw items. Iterating them (as list(key) would)
		# splits a multi-character string into characters and fails on ints.
		return key if isinstance(key, tuple) else (key,)

	# Compute the frequent itemsets of the next size
	def countItem(self,upDict,elength):
		"""Build the frequent itemsets of size ``elength`` from ``upDict``.

		upDict: frequent itemsets of the previous size (keys are bare items
			or tuples of items; values are ignored here)
		elength: maximum size a merged candidate may have

		Every pair of keys is merged; a merged candidate is kept when it has
		at most ``elength`` items and is contained in at least
		``self.threshold`` transactions of ``self.dataset``.

		Returns a dict mapping each surviving candidate (a tuple, items in
		the order they appear in the two merged keys) to its support count,
		or None when no candidate survives.
		"""
		# Convert each transaction to a set once instead of once per candidate.
		transactions = [set(row) for row in self.dataset]
		parents = [self._as_tuple(key) for key in upDict]
		currentDict = {}
		# Item combinations already evaluated, regardless of item order.
		seen = set()
		for i, left in enumerate(parents):
			for right in parents[i+1:]:
				candidate = left + tuple(x for x in right if x not in left)
				if len(candidate) > elength:
					continue
				signature = frozenset(candidate)
				if signature in seen:
					continue
				seen.add(signature)
				support = sum(1 for row in transactions if signature <= row)
				# Pruning: never keep a candidate that occurs in no transaction,
				# even when the threshold is 0.
				if support > 0 and support >= self.threshold:
					currentDict[candidate] = support
		return currentDict or None

	# Generate the frequent itemsets
	def fit(self,dataset):
		"""Compute the frequent itemsets of ``dataset`` and store them in ``self.count``.

		dataset: a sized sequence of transactions, each an iterable of
			hashable items.
		"""
		self.dataset = dataset
		# Support is measured against the number of transactions. Assigning
		# (rather than incrementing) keeps repeated fit() calls independent.
		self.num = len(dataset)
		self.threshold = self.num * self.minSupport

		singles = {}
		for transaction in dataset:
			# dict.fromkeys drops repeats inside one transaction while keeping
			# first-seen order, so an item counts once per transaction.
			for item in dict.fromkeys(transaction):
				singles[item] = singles.get(item, 0) + 1

		# Pruning: keep only the single items that reach the threshold.
		count = [{item: n for item, n in singles.items() if n >= self.threshold}]

		# Grow itemsets one item at a time until fewer than two itemsets
		# remain to merge or no merged candidate is frequent.
		while len(count[-1]) >= 2:
			nextLevel = self.countItem(count[-1], len(count) + 1)
			if nextLevel is None:
				break
			count.append(nextLevel)

		self.count = count

	def frequentItemsets(self):
		"""Print each level of ``self.count`` followed by a blank line and return ``self.count``.

		``fit()`` must have been called first; before that ``self.count`` is None.
		"""
		#print('threshold:',self.threshold)
		for item in self.count:
			print(item)
			print()
		return self.count


#初始化資料
def set_data(num):
	"""Return ``num`` random transactions for the demo.

	Each transaction is a list of 1 to 5 distinct letters drawn from
	'b'..'k' ('a' plus a random offset of 1 to 10). Repeated draws collapse
	because the letters are collected in a set, so a transaction may hold
	fewer letters than were drawn.
	"""
	transactions = []
	for _ in range(num):
		draws = random.randint(1, 5)
		letters = set()
		for _ in range(draws):
			letters.add(chr(ord('a') + random.randint(1, 10)))
		transactions.append(list(letters))
	return transactions


if __name__ == '__main__':
	# Demo: generate 10 random transactions, print them, then mine and
	# print their frequent itemsets with the default thresholds.
	transactions = set_data(10)
	for transaction in transactions:
		print(transaction)
	print()
	miner = Association_rules()
	miner.fit(transactions)
	freItemsets = miner.frequentItemsets()

結果

資料集

['g', 'c', 'i']
['f', 'i']
['f', 'b']
['d', 'i', 'j']
['g', 'e']
['c', 'i', 'b', 'd', 'k']
['k']
['k', 'c', 'b', 'g']
['g', 'k', 'h']
['g', 'i']

Frequent Itemsets

{'g': 5, 'c': 3, 'i': 5, 'f': 2, 'b': 3, 'd': 2, 'k': 4}

{('g', 'c'): 2, ('g', 'i'): 2, ('g', 'k'): 2, ('c', 'i'): 2, ('c', 'b'): 2, ('c', 'k'): 2, ('d', 'i'): 2, ('k', 'b'): 2}

{('k', 'c', 'b'): 2}

問題

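因為itemset有不同的長度,每個frequent itemset可以拆出的關聯規則很多,暫時還不知道如何列出一個itemset中的所有關聯規則。一個可行的方向:對長度不小於2的frequent itemset I,列舉它的每個非空真子集A(例如用itertools.combinations),得到規則 A → I−A,其置信度為 support(I)/support(A),只保留置信度不小於minConfidence的規則。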