基於使用者最近鄰模型的協同過濾演算法的Python程式碼實現
阿新 • • 發佈:2018-12-30
#-------------------------------------------------------------------------------
# Name:        PearsonUserNeighCF
# Purpose:     Personalized Recommendation
#
# Author:      Jinkun Wang
# Email:       [email protected], if you have any question about the
#              code, please do not hesitate to contact me.
#
# Created:     10/09/2014
# Copyright:   (c) Jinkun Wang 2014
#-------------------------------------------------------------------------------
from math import sqrt

import numpy as np  # not used below; kept so the module namespace still exposes np


def _parseRatingLines(path):
    """Yield (userId, itemId, rating) for every non-blank line of a ratings file.

    Each line must hold four tab-separated fields; the fourth field is
    ignored.  The rating is converted to float, the ids stay strings.
    """
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of failing
                # on the 4-field unpacking below.
                continue
            userId, itemId, rating, _ = line.split('\t')
            yield userId, itemId, float(rating)


def loadData(trainFile='ml-100k/u1.base', testFile='ml-100k/u1.test'):
    """Load the training and test rating files.

    Args:
        trainFile: path of the training set (tab-separated, 4 fields per line).
        testFile: path of the test set (same format).

    Returns:
        A 4-tuple ``(trainSet, testSet, movieUser, item_in_train)``:
        trainSet and testSet map userId -> {itemId: rating}; movieUser is the
        inverted index itemId -> [userId, ...] built from the training set;
        item_in_train lists every item id seen in the training set, so that
        test items unseen during training can be detected later.

    Raises:
        ValueError: if a non-blank line does not have exactly four
            tab-separated fields or its rating is not a number.
    """
    trainSet = {}
    testSet = {}
    movieUser = {}

    # Training set: ratings plus the item -> users inverted index.  The same
    # userId string is used as key in both structures so lookups always agree.
    for userId, itemId, rating in _parseRatingLines(trainFile):
        # setdefault keeps the first rating if a (user, item) pair repeats.
        trainSet.setdefault(userId, {}).setdefault(itemId, rating)
        movieUser.setdefault(itemId, []).append(userId)

    item_in_train = list(movieUser)

    for userId, itemId, rating in _parseRatingLines(testFile):
        testSet.setdefault(userId, {}).setdefault(itemId, rating)

    return trainSet, testSet, movieUser, item_in_train


def getAverageRating(user, trainSet=None):
    """Return the mean of all training ratings given by ``user``.

    Args:
        user: user id (a key of the training set).
        trainSet: mapping userId -> {itemId: rating}.  When omitted, the
            module-level ``trainSet`` created by the script entry point is
            used, which keeps the old one-argument call working.

    Raises:
        NameError: if ``trainSet`` is omitted and no module-level
            ``trainSet`` exists.
        KeyError: if ``user`` is not in the training set.
        ZeroDivisionError: if ``user`` has no ratings.
    """
    if trainSet is None:
        try:
            trainSet = globals()['trainSet']
        except KeyError:
            raise NameError("name 'trainSet' is not defined; "
                            "pass trainSet explicitly")
    ratings = trainSet[user]
    return (sum(ratings.values()) * 1.0) / len(ratings)


def _pearson(ratings1, ratings2, coRated):
    """Pearson correlation of two users restricted to the items in coRated.

    Means are taken over the co-rated items only.  Returns 0.0 when coRated
    is empty or when either user has zero variance on it.
    """
    if not coRated:
        return 0.0
    count = float(len(coRated))
    mean1 = sum(ratings1[m] for m in coRated) / count
    mean2 = sum(ratings2[m] for m in coRated) / count

    num = 0.0   # numerator of the Pearson formula
    den1 = 0.0  # sum of squared deviations of user 1
    den2 = 0.0  # sum of squared deviations of user 2
    for m in coRated:
        dev1 = ratings1[m] - mean1
        dev2 = ratings2[m] - mean2
        num += dev1 * dev2
        den1 += dev1 * dev1
        den2 += dev2 * dev2

    den1 = sqrt(den1)
    den2 = sqrt(den2)
    if den1 == 0 or den2 == 0:
        return 0.0
    return num / (den1 * den2)


def UserSimPearson(trainSet):
    """Compute the Pearson similarity between every pair of training users.

    Args:
        trainSet: mapping userId -> {itemId: rating}.

    Returns:
        Nested dict ``userSim[u1][u2]``.  Every user has an entry for every
        user (including itself); the self-similarity and the similarity of
        users without usable co-rated items are 0.
    """
    users = list(trainSet)
    # Build each user's rated-item set once instead of once per pair.
    ratedItems = dict((u, set(trainSet[u])) for u in users)
    # Pre-fill with 0 so every row keeps the key order of trainSet.
    userSim = dict((u, dict.fromkeys(users, 0)) for u in users)

    # The measure is symmetric: compute each unordered pair once and mirror.
    for i, u1 in enumerate(users):
        for u2 in users[i + 1:]:
            coRated = ratedItems[u1] & ratedItems[u2]
            if not coRated:
                continue
            sim = _pearson(trainSet[u1], trainSet[u2], coRated)
            userSim[u1][u2] = sim
            userSim[u2][u1] = sim
    return userSim


def sortSimMatrix(userSim):
    """Rank every user's neighbours by decreasing similarity.

    Args:
        userSim: nested dict as returned by ``UserSimPearson``.

    Returns:
        Dict userId -> list of user ids ordered from most to least similar.
        The sort is stable, so ties keep the order of ``userSim[u]``.
    """
    neighSorted = {}
    for u, sims in userSim.items():
        ranked = sorted(sims.items(), key=lambda pair: pair[1], reverse=True)
        neighSorted[u] = [neighbor for neighbor, _ in ranked]
    return neighSorted


def _globalMeanRating(trainSet):
    """Return the mean of every rating in trainSet (ValueError if there is none)."""
    total = 0.0
    count = 0
    for ratings in trainSet.values():
        total += sum(ratings.values())
        count += len(ratings)
    if count == 0:
        raise ValueError('trainSet contains no ratings')
    return total / count


def getAccuracyMetric(N, trainSet, testSet, movieUser, neighSorted, userSim,
                      item_in_train):
    """Predict every test rating with the N nearest neighbours and score it.

    For a test pair (user, item) the prediction is the user's mean rating
    plus the similarity-weighted average of the neighbours' mean-centred
    ratings of the item.  The neighbours are the N users most similar to
    ``user`` among those who rated the item in the training set.

    Fallbacks:
        * item unseen in training, or all chosen neighbours have similarity
          0: the user's own mean rating is predicted;
        * user absent from (or without ratings in) the training set: the
          global mean training rating is predicted.

    Args:
        N: maximum number of neighbours used per prediction.
        trainSet: mapping userId -> {itemId: rating} (training data).
        testSet: mapping userId -> {itemId: rating} (held-out data).
        movieUser: mapping itemId -> list of users who rated it in training.
        neighSorted: mapping userId -> users sorted by decreasing similarity.
        userSim: nested similarity dict ``userSim[u1][u2]``.
        item_in_train: iterable of item ids present in the training set.

    Returns:
        Tuple ``(mae, rmse)`` over all test ratings.

    Raises:
        ZeroDivisionError: if testSet holds no ratings.
        ValueError: if a cold-start user is met and trainSet has no ratings.
    """
    knownItems = set(item_in_train)  # O(1) membership instead of a list scan
    meanCache = {}                   # user means are reused many times

    def meanOf(u):
        if u not in meanCache:
            meanCache[u] = getAverageRating(u, trainSet)
        return meanCache[u]

    globalMean = None
    pred = {}
    for user, items in testSet.items():
        userPred = pred.setdefault(user, {})

        if not trainSet.get(user):
            # Cold-start user: no personal mean and no neighbours exist.
            if globalMean is None:
                globalMean = _globalMeanRating(trainSet)
            for m in items:
                userPred[m] = globalMean
            continue

        ave_u_rating = meanOf(user)
        # Position of each neighbour in the ranking; replaces list.index(),
        # which made the sort below quadratic.  setdefault keeps the first
        # position, exactly like list.index().
        rank = {}
        for position, neighbor in enumerate(neighSorted[user]):
            rank.setdefault(neighbor, position)
        sims = userSim[user]

        for m in items:
            if m not in knownItems:
                userPred[m] = ave_u_rating
                continue

            # Users who rated m in training, most similar first, top N.
            neigh = sorted(movieUser[m], key=rank.__getitem__)[:N]

            neighRating = 0.0
            neighSimSum = 0.0
            for neighUser in neigh:
                sim = sims[neighUser]
                neighRating += (trainSet[neighUser][m] - meanOf(neighUser)) * sim
                neighSimSum += abs(sim)

            if neighSimSum == 0:
                userPred[m] = ave_u_rating
            else:
                userPred[m] = ave_u_rating + neighRating / neighSimSum

    # Compare the predictions with the held-out ratings.
    error_sum = 0.0
    sqrError_sum = 0.0
    setSum = 0
    for user, userPred in pred.items():
        for m, predicted in userPred.items():
            diff = predicted - testSet[user][m]
            error_sum += abs(diff)
            sqrError_sum += diff * diff
            setSum += 1

    mae = error_sum / setSum
    rmse = sqrt(sqrError_sum / setSum)
    return mae, rmse


if __name__ == '__main__':
    # Imported here so the functions above can be imported and used on
    # machines without matplotlib; only the script itself needs plotting.
    import matplotlib.pyplot as plt

    print('正在載入資料...')
    trainSet, testSet, movieUser, item_in_train = loadData()

    print('正在計算使用者間相似度...')
    userSim = UserSimPearson(trainSet)

    print('對相似度列表按相似度大小進行排列...')
    neighSorted = sortSimMatrix(userSim)

    print('正在尋找最近鄰...')
    NeighborSize = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    MAE = []
    RMSE = []
    for N in NeighborSize:  # evaluate each neighbourhood size
        mae, rmse = getAccuracyMetric(N, trainSet, testSet, movieUser,
                                      neighSorted, userSim, item_in_train)
        MAE.append(mae)
        RMSE.append(rmse)

    plt.subplot(211)
    plt.plot(NeighborSize, MAE)
    plt.xlabel('NeighborSize')
    plt.ylabel('Mean Absolute Error')
    plt.title('Pearson User Neighbor Model Collaborative Filtering')

    plt.subplot(212)
    plt.plot(NeighborSize, RMSE)
    plt.xlabel('NeighborSize')
    plt.ylabel('Root Mean Square Error')
    plt.title('Pearson User Neighbor Model Collaborative Filtering')
    plt.show()

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        raw_input('按任意鍵繼續...')
    except NameError:
        input('按任意鍵繼續...')