1. 程式人生 > >在MovieLens資料集上用SVD進行評分預測

在MovieLens資料集上用SVD進行評分預測

參考了 Yehuda Koren 2008 年發表於 KDD 的論文《Factorization Meets the Neighborhood: A Multifaceted Collaborative Filtering Model》。

程式碼如下:

'''
Version:1.0
Created on 2014-02-25
@Author:Dior
'''

import random
import math
import cPickle as pickle

class SVD():
    """Biased matrix-factorization ("SVD") rating predictor trained by SGD.

    A rating is predicted as ``av + bu[u] + bi[i] + dot(pu[u], qi[i])`` and
    then clamped to the range [1, 5] (see ``predictScore``).

    All input files are read as tab-separated text whose first three columns
    are user id, item id and rating.  User and item ids are parsed as
    integers and used, minus one, as indexes into the parameter tables.  The
    tables are sized by the number of *distinct* ids found in ``allfile``,
    so ids are assumed to be contiguous and to start at 1.  Blank lines are
    ignored.
    """

    def __init__(self,allfile,trainfile,testfile,factorNum=10):
        """Store the file paths and hyper-parameters, then build the model.

        allfile   -- file listing every rating; used only to count users/items
        trainfile -- ratings used for training and for the global average
        testfile  -- ratings used to compute the RMSE after each epoch
        factorNum -- number of latent factors per user / item
        """
        self.allfile=allfile
        self.trainfile=trainfile
        self.testfile=testfile
        self.factorNum=factorNum
        self.userNum=self.getUserNum()
        self.itemNum=self.getItemNum()
        #SGD step size
        self.learningRate=0.01
        #L2 regularization weight (lambda)
        self.regularization=0.05
        self.initModel()

    def _iterFields(self,filename):
        """Yield the tab-separated fields of each non-blank line of filename."""
        #the with-block guarantees the handle is closed once reading is done
        with open(filename) as handle:
            for line in handle:
                if line.strip():
                    yield line.split('\t')

    def _iterRatings(self,filename):
        """Yield (userIndex, itemIndex, rating) for each rating in filename.

        Ids in the file are 1-based; the yielded indexes are 0-based.
        """
        for fields in self._iterFields(filename):
            yield (int(fields[0].strip())-1,
                   int(fields[1].strip())-1,
                   float(fields[2].strip()))

    def _countDistinct(self,column):
        """Return how many distinct values appear in a column of allfile."""
        return len(set(fields[column].strip()
                       for fields in self._iterFields(self.allfile)))

    def getUserNum(self):
        """Return the number of distinct user ids in ``allfile``."""
        return self._countDistinct(0)

    def getItemNum(self):
        """Return the number of distinct item ids in ``allfile``."""
        return self._countDistinct(1)

    def initModel(self):
        """Initialize the global average, the biases and the factor tables.

        Biases start at 0.0; every latent factor starts as a random value in
        [0, 0.1/sqrt(factorNum)).
        """
        self.av=self.average(self.trainfile)
        self.bu=[0.0 for _ in range(self.userNum)]
        self.bi=[0.0 for _ in range(self.itemNum)]
        scale=math.sqrt(self.factorNum)
        self.pu=[[0.1*random.random()/scale for _ in range(self.factorNum)]
                 for _ in range(self.userNum)]
        self.qi=[[0.1*random.random()/scale for _ in range(self.factorNum)]
                 for _ in range(self.itemNum)]
        #single parenthesized argument: prints identically on Python 2 and 3
        print("Initialize end.The user number is:%d,item number is:%d,the average score is:%f" % (self.userNum,self.itemNum,self.av))

    def train(self,iterTimes=100):
        """Run up to ``iterTimes`` SGD passes over the training file.

        After every pass the RMSE on the test file is printed; training
        stops early as soon as that RMSE is larger than the previous one.
        """
        print("Beginning to train the model......")
        rate=self.learningRate
        reg=self.regularization
        preRmse=10000.0
        for epoch in range(iterTimes):
            for user,item,rating in self._iterRatings(self.trainfile):
                userVec=self.pu[user]
                itemVec=self.qi[item]
                pscore=self.predictScore(self.av,self.bu[user],self.bi[item],userVec,itemVec)
                #error between the real rating and the (clamped) prediction
                eui=rating-pscore

                #update the user and item rating biases
                self.bu[user]+=rate*(eui-reg*self.bu[user])
                self.bi[item]+=rate*(eui-reg*self.bi[item])
                for k in range(self.factorNum):
                    #keep the old user factor: the item update must use it
                    oldPu=userVec[k]
                    #the user factor moves along THIS item's factor
                    #(indexing qi by the user index was a bug)
                    userVec[k]+=rate*(eui*itemVec[k]-reg*oldPu)
                    itemVec[k]+=rate*(oldPu*eui-reg*itemVec[k])
            curRmse=self.test(self.av,self.bu,self.bi,self.pu,self.qi)
            print("Iteration %d times,RMSE is : %f" % (epoch+1,curRmse))
            if curRmse>preRmse:
                break
            preRmse=curRmse
        print("Iteration finished!")

    def test(self,av,bu,bi,pu,qi):
        """Return the RMSE of the given parameters on the test file.

        Raises ZeroDivisionError if the test file contains no ratings.
        """
        squaredError=0.0
        cnt=0
        for user,item,score in self._iterRatings(self.testfile):
            cnt+=1
            pscore=self.predictScore(av,bu[user],bi[item],pu[user],qi[item])
            squaredError+=math.pow(score-pscore,2)
        return math.sqrt(squaredError/cnt)

    def average(self,filename):
        """Return the mean of the rating column (third column) of filename.

        Raises ZeroDivisionError if the file contains no ratings.
        """
        total=0.0
        cnt=0
        for fields in self._iterFields(filename):
            cnt+=1
            total+=float(fields[2].strip())
        return total/cnt

    def innerProduct(self,v1,v2):
        """Return the dot product of v1 and v2 over the length of v1."""
        result=0.0
        for i,x in enumerate(v1):
            result+=x*v2[i]
        return result

    def predictScore(self,av,bu,bi,pu,qi):
        """Return av + bu + bi + dot(pu, qi), clamped to the range [1, 5]."""
        pscore=av+bu+bi+self.innerProduct(pu,qi)
        if pscore<1:
            pscore=1
        if pscore>5:
            pscore=5
        return pscore
    
if __name__=='__main__':
    # Script entry point: train on the MovieLens files under ./data.
    import os
    # Build the paths with os.path.join instead of hard-coding "\\", so the
    # script is not tied to the Windows path separator.
    dataDir="data"
    s=SVD(os.path.join(dataDir,"u.data"),
          os.path.join(dataDir,"ua.base"),
          os.path.join(dataDir,"ua.test"))
    s.train()

    

實驗結果如下:

Initialize end.The user number is:943,item number is:1682,the average score is:3.523827
Beginning to train the model......
Iteration 1 times,RMSE is : 1.002799
Iteration 2 times,RMSE is : 0.982096
Iteration 3 times,RMSE is : 0.972882
Iteration 4 times,RMSE is : 0.967720
Iteration 5 times,RMSE is : 0.964554
Iteration 6 times,RMSE is : 0.962498
Iteration 7 times,RMSE is : 0.961116
Iteration 8 times,RMSE is : 0.960166
Iteration 9 times,RMSE is : 0.959482
Iteration 10 times,RMSE is : 0.958933
Iteration 11 times,RMSE is : 0.958416
Iteration 12 times,RMSE is : 0.957814
Iteration 13 times,RMSE is : 0.956986
Iteration 14 times,RMSE is : 0.955798
Iteration 15 times,RMSE is : 0.954165
Iteration 16 times,RMSE is : 0.952135
Iteration 17 times,RMSE is : 0.949907
Iteration 18 times,RMSE is : 0.947718
Iteration 19 times,RMSE is : 0.945695
Iteration 20 times,RMSE is : 0.943901
Iteration 21 times,RMSE is : 0.942296
Iteration 22 times,RMSE is : 0.940793
Iteration 23 times,RMSE is : 0.939336
Iteration 24 times,RMSE is : 0.937880
Iteration 25 times,RMSE is : 0.936398
Iteration 26 times,RMSE is : 0.934883
Iteration 27 times,RMSE is : 0.933353
Iteration 28 times,RMSE is : 0.931833
Iteration 29 times,RMSE is : 0.930368
Iteration 30 times,RMSE is : 0.928991
Iteration 31 times,RMSE is : 0.927724
Iteration 32 times,RMSE is : 0.926570
Iteration 33 times,RMSE is : 0.925547
Iteration 34 times,RMSE is : 0.924663
Iteration 35 times,RMSE is : 0.923920
Iteration 36 times,RMSE is : 0.923318
Iteration 37 times,RMSE is : 0.922853
Iteration 38 times,RMSE is : 0.922525
Iteration 39 times,RMSE is : 0.922330
Iteration 40 times,RMSE is : 0.922255
Iteration 41 times,RMSE is : 0.922297
Iteration finished!

最後的 RMSE 約為 0.922:第 40 輪達到最低值 0.922255,第 41 輪回升到 0.922297,因此訓練提前停止。可見效果一般,後面會嘗試改進。