1. 程式人生 > 基於使用者最近鄰模型的協同過濾演算法的Python程式碼實現

基於使用者最近鄰模型的協同過濾演算法的Python程式碼實現

#-------------------------------------------------------------------------------
# Name:        PearsonUserNeighCF
# Purpose:     Personalized Recommendation
#
# Author:      Jinkun Wang
# Email:       [email protected], if you have any question about the
#              code, please do not hesitate to contact me.
#
# Created:     10/09/2014
# Copyright:   (c) Jinkun Wang 2014
#-------------------------------------------------------------------------------
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

def _iterRatings(path):
    """Yield ``(userId, itemId, rating)`` for every non-blank line of *path*.

    Each line is stripped and split on tabs into exactly four fields; the
    fourth field is discarded and the rating is converted to ``float``.
    """
    # The with-block closes the handle even when a line fails to parse.
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines, e.g. a trailing newline
            userId, itemId, rating, _ = line.split('\t')
            yield userId, itemId, float(rating)


def loadData(trainFile='ml-100k/u1.base', testFile='ml-100k/u1.test'):
    """Load the training and test rating files into nested dictionaries.

    Args:
        trainFile: Path of the training ratings file.
        testFile: Path of the test ratings file.

    Returns:
        A tuple ``(trainSet, testSet, movieUser, item_in_train)``:
        ``trainSet`` and ``testSet`` map user id -> {item id -> rating},
        ``movieUser`` maps item id -> list of user ids that rated the item
        in the training file, and ``item_in_train`` is the list of item ids
        that occur in the training file.

    Raises:
        IOError / OSError: If one of the files cannot be opened.
        ValueError: If a non-blank line does not have four tab-separated
            fields or its rating is not a number.
    """
    trainSet = {}
    testSet = {}
    movieUser = {}

    # Training set; also builds the item -> users inverted index.
    # setdefault keeps the first rating when a (user, item) pair repeats.
    for userId, itemId, rating in _iterRatings(trainFile):
        trainSet.setdefault(userId, {}).setdefault(itemId, rating)
        # Same key as in trainSet, so later trainSet[user] lookups never miss.
        movieUser.setdefault(itemId, []).append(userId)

    # Lets callers detect test items that never occur in the training set.
    item_in_train = list(movieUser)

    # Test set.
    for userId, itemId, rating in _iterRatings(testFile):
        testSet.setdefault(userId, {}).setdefault(itemId, rating)

    return trainSet, testSet, movieUser, item_in_train

#計算一個使用者的平均評分
def getAverageRating(user, ratings=None):
    """Return the mean of all ratings given by *user*.

    Args:
        user: Key of the user in the ratings table.
        ratings: Optional mapping user -> {item -> rating}. When omitted,
            the module-level ``trainSet`` is used, which is what
            single-argument callers have always relied on.

    Returns:
        The arithmetic mean of the user's ratings as a float.

    Raises:
        KeyError: If *user* is not in the ratings table.
        ZeroDivisionError: If the user has no ratings.
        NameError: If *ratings* is omitted and no module-level ``trainSet``
            has been defined yet.
    """
    if ratings is None:
        ratings = trainSet
    userRatings = ratings[user]
    # "* 1.0" forces true division on Python 2 even for integer ratings.
    return (sum(userRatings.values()) * 1.0) / len(userRatings)

#計算使用者相似度
def UserSimPearson(trainSet):
    """Build the user-to-user Pearson similarity table.

    For every ordered pair of distinct users the correlation is computed
    over the items both of them rated, using each user's mean over those
    shared items only.

    Args:
        trainSet: Mapping user -> {item -> rating}.

    Returns:
        A dict ``userSim`` with ``userSim[u1][u2]`` defined for every pair
        of users. The value is 0 for a user paired with itself, for pairs
        without shared items, and for pairs where either user's ratings on
        the shared items have no spread.
    """
    userSim = {}
    for userA in trainSet.keys():
        row = userSim.setdefault(userA, {})
        ratingsA = trainSet[userA]
        for userB in trainSet.keys():
            # Start from 0; it is only overwritten when a correlation exists.
            row[userB] = 0
            if userA == userB:
                continue

            ratingsB = trainSet[userB]
            shared = list(set(ratingsA.keys()) & set(ratingsB.keys()))
            if not shared:
                continue

            # Per-user mean rating restricted to the shared items.
            totalA = 0
            totalB = 0
            for item in shared:
                totalA += ratingsA[item]
                totalB += ratingsB[item]
            meanA = totalA / len(shared)
            meanB = totalB / len(shared)

            covariance = 0.0    # numerator of the Pearson formula
            spreadA = 0.0       # squared deviations of userA
            spreadB = 0.0       # squared deviations of userB
            for item in shared:
                deltaA = ratingsA[item] - meanA
                deltaB = ratingsB[item] - meanB
                covariance += deltaA * deltaB
                spreadA += deltaA ** 2
                spreadB += deltaB ** 2
            normA = sqrt(spreadA)
            normB = sqrt(spreadB)

            # A zero norm would divide by zero, so the similarity stays 0.
            if normA != 0 and normB != 0:
                row[userB] = covariance / (normA * normB)
    return userSim

#對使用者相似度表進行排序處理
def sortSimMatrix(userSim):
    """Rank each user's neighbours from most to least similar.

    Args:
        userSim: Mapping user -> {other user -> similarity}.

    Returns:
        A dict mapping each user to the list of other-user keys ordered by
        descending similarity; equal similarities keep the order in which
        they appear in the input row. Users whose row is empty are left
        out of the result.
    """
    neighSorted = {}
    for user, sims in userSim.items():
        ranked = sorted(sims, key=sims.get, reverse=True)
        if ranked:
            neighSorted[user] = ranked
    return neighSorted

#尋找使用者最近鄰並生成推薦結果;與測試集比較獲得演算法的準確度
def getAccuracyMetric(N,trainSet,testSet,movieUser,neighSorted, userSim, item_in_train):
    """Predict every test rating from the N nearest raters and score the result.

    For a test pair (user, m) the prediction is::

        mean(user) + sum(sim(user, v) * (r[v][m] - mean(v))) / sum(|sim(user, v)|)

    where v ranges over the (at most) N training users who rated m and rank
    highest in ``neighSorted[user]``. The user's own mean is predicted when
    m is not in ``item_in_train`` or when the absolute similarities sum to 0.

    Args:
        N: Maximum number of neighbours used per prediction.
        trainSet: Mapping user -> {item -> rating} used for training.
        testSet: Mapping user -> {item -> rating} to be predicted.
        movieUser: Mapping item -> list of training users who rated it.
        neighSorted: Mapping user -> list of users, most similar first.
        userSim: Mapping user -> {other user -> similarity}.
        item_in_train: Collection of item ids present in the training set.

    Returns:
        A tuple ``(mae, rmse)`` over all predicted test ratings.

    Raises:
        KeyError: If a test user is missing from ``trainSet`` or
            ``neighSorted``.
        ValueError: If a user listed in ``movieUser`` does not appear in
            the test user's ``neighSorted`` list.
        ZeroDivisionError: If ``testSet`` contains no ratings.
    """
    knownItems = set(item_in_train)     # O(1) membership instead of a list scan
    meanOf = {}                         # user -> mean training rating (cache)

    def meanRating(someUser):
        # Means come from the trainSet argument rather than a module global,
        # and each one is computed once instead of once per use.
        if someUser not in meanOf:
            given = trainSet[someUser]
            meanOf[someUser] = (sum(given.values()) * 1.0) / len(given)
        return meanOf[someUser]

    # Phase 1: find the nearest raters and build the predictions.
    pred = {}
    for user, items in testSet.items():
        userPred = pred.setdefault(user, {})
        userMean = meanRating(user)

        # Position of every neighbour in the similarity ranking; setdefault
        # keeps the first position, which is what list.index would return.
        rank = {}
        for position, neighbour in enumerate(neighSorted[user]):
            rank.setdefault(neighbour, position)

        for m in items.keys():
            if m not in knownItems:
                # Item never seen in training: fall back to the user's mean.
                userPred[m] = userMean
                continue

            raters = movieUser[m]   # training users who rated item m
            try:
                ordered = sorted(raters, key=rank.__getitem__)
            except KeyError as missing:
                # Same exception type a failed list.index lookup would raise.
                raise ValueError('%r is not in list' % (missing.args[0],))
            neigh = ordered[:N]

            neighRating = 0
            neighSimSum = 0
            for neighUser in neigh:
                sim = userSim[user][neighUser]
                neighRating += (trainSet[neighUser][m] - meanRating(neighUser)) * sim
                neighSimSum += abs(sim)

            if neighSimSum == 0:
                userPred[m] = userMean
            else:
                userPred[m] = userMean + (neighRating * 1.0) / neighSimSum

    # Phase 2: compare the predictions with the held-out ratings.
    absErrorSum = 0
    sqrErrorSum = 0
    count = 0
    for user, userPred in pred.items():
        actual = testSet[user]
        for m, predicted in userPred.items():
            diff = predicted - actual[m]
            absErrorSum += abs(diff)
            sqrErrorSum += pow(diff, 2)
            count += 1
    mae = absErrorSum / count
    rmse = sqrt(sqrErrorSum / count)
    return mae, rmse

if __name__ == '__main__':
    # Script entry point: load the data, build and rank the similarity table,
    # then plot MAE and RMSE for a range of neighbourhood sizes.
    #
    # print(...) with a single argument and the raw_input/input fallback
    # below keep this block valid on both Python 2 and Python 3.
    try:
        waitForUser = raw_input
    except NameError:
        waitForUser = input

    print('正在載入資料...')
    # These names must stay module-level: getAverageRating(user) reads the
    # global trainSet when it is called with a single argument.
    trainSet,testSet,movieUser,item_in_train = loadData()

    print('正在計算使用者間相似度...')
    userSim = UserSimPearson(trainSet)

    print('對相似度列表按相似度大小進行排列...')
    neighSorted = sortSimMatrix(userSim)

    print('正在尋找最近鄰...')
    NeighborSize = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    MAE = []
    RMSE = []
    for N in NeighborSize:            # one evaluation per neighbourhood size
        mae, rmse = getAccuracyMetric(N,trainSet,testSet,movieUser,neighSorted, userSim, item_in_train)
        MAE.append(mae)
        RMSE.append(rmse)

    # Two stacked panels sharing the same x axis values: MAE above, RMSE below.
    panels = (
        (211, MAE, 'Mean Absolute Error'),
        (212, RMSE, 'Root Mean Square Error'),
    )
    for position, series, yLabel in panels:
        plt.subplot(position)
        plt.plot(NeighborSize,series)
        plt.xlabel('NeighborSize')
        plt.ylabel(yLabel)
        plt.title('Pearson User Neighbor Model Collaborative Filtering')

    plt.show()
    waitForUser('按任意鍵繼續...')