【知識發現】隱語義模型LFM演算法python實現(二)

【知識發現】隱語義模型LFM演算法python實現(二)

http://blog.csdn.net/fjssharpsword/article/details/78015956

基於該篇文章中的程式碼優化,主要是在生成負樣例上提高執行速度,程式碼參考如下:

# -*- coding: utf-8 -*-
'''
Created on 2017年10月16日

@author: Administrator
'''
import numpy as np
import pandas as pd
from math import exp
import time
import math

class LFM:
    """Latent Factor Model (LFM) collaborative-filtering recommender.

    Learns a user latent matrix ``p`` (user x class) and an item latent
    matrix ``q`` (class x item) by stochastic gradient descent on
    implicit-feedback data: a user's rated items are the positive
    samples; negatives are sampled from the most popular items the user
    has not rated.
    """

    def __init__(self, lclass, iters, alpha, lamda, topk, ratio, traindata):
        self.lclass = lclass        # number of latent classes (affects quality)
        self.iters = iters          # SGD iterations; the best count must be tuned
        self.alpha = alpha          # gradient-descent learning rate
        self.lamda = lamda          # L2 regularization coefficient
        self.topk = topk            # recommend the top-k items
        self.ratio = ratio          # negative/positive sample ratio (largest quality impact)
        self.traindata = traindata  # DataFrame with 'userid' and 'itemid' columns

    # ---- initialization ----
    def getUserPositiveItem(self, userid):
        """Return the list of items *userid* has interacted with (positive samples)."""
        traindata = self.traindata
        series = traindata[traindata['userid'] == userid]['itemid']
        return list(series.values)

    def getUserNegativeItem(self, userid):
        """Sample negative items for *userid*.

        Walks the popularity-sorted item list (hot items first) and
        collects up to ``ratio * (#positives)`` items the user has not
        rated. Fewer may be returned if the catalog runs out.
        """
        traindata = self.traindata
        itemLen = self.itemLen  # item popularity Series, sorted descending
        ratedItems = set(traindata[traindata['userid'] == userid]['itemid'])
        remaining = self.ratio * len(ratedItems)  # how many negatives to draw
        negativeItemList = []
        # Series.iteritems() was removed in pandas 2.0; the counts themselves
        # are unused, so iterate the (popularity-ordered) index directly.
        for item in itemLen.index:
            if remaining == 0:
                break
            if item in ratedItems:
                continue
            negativeItemList.append(item)
            remaining -= 1
        return negativeItemList

    def initUserItem(self, userid):
        """Build the labelled sample dict for one user: {item: 1 positive / 0 negative}."""
        itemDict = {}
        for item in self.getUserPositiveItem(userid):
            itemDict[item] = 1
        for item in self.getUserNegativeItem(userid):
            itemDict[item] = 0
        return itemDict

    def initModel(self):
        """Initialize the model state.

        Builds item popularity counts, random p/q matrices (uniform in
        [0, 1)), and the per-user labelled sample set.

        Returns:
            (p, q, userItem) where userItem is a list of
            {userid: {itemid: 0/1}} dicts.
        """
        traindata = self.traindata
        lclass = self.lclass
        userID = list(set(traindata['userid'].values))
        self.userID = userID
        itemID = list(set(traindata['itemid'].values))
        self.itemID = itemID
        # popularity of each item (number of ratings), sorted descending
        itemCount = [len(traindata[traindata['itemid'] == item]['userid']) for item in itemID]
        self.itemLen = pd.Series(itemCount, index=itemID).sort_values(ascending=False)
        # initialize p and q with random values in [0, 1)
        arrayp = np.random.rand(len(userID), lclass)
        arrayq = np.random.rand(lclass, len(itemID))
        p = pd.DataFrame(arrayp, columns=range(lclass), index=userID)
        q = pd.DataFrame(arrayq, columns=itemID, index=range(lclass))
        # generate the positive/negative sample set per user
        userItem = []
        for userid in userID:
            userItem.append({userid: self.initUserItem(userid)})
        return p, q, userItem
    # ---- end initialization ----

    def sigmod(self, x):
        """Logistic function: squash the raw interest score into [0, 1]."""
        return 1.0 / (1 + exp(-x))

    def lfmPredict(self, p, q, userID, itemID):
        """Predict the interest of *userID* in *itemID* as sigmoid(p_u . q_i)."""
        # DataFrame.ix was removed from pandas; use .loc / column selection.
        # np.dot replaces the deprecated np.mat product-and-sum.
        r = float(np.dot(p.loc[userID].values, q[itemID].values))
        return self.sigmod(r)

    def latenFactorModel(self):
        """Train p and q by SGD with L2 regularization.

        Returns:
            (p, q) — the learned latent-factor DataFrames.
        """
        lclass = self.lclass
        iters = self.iters
        alpha = self.alpha
        lamda = self.lamda
        p, q, userItem = self.initModel()
        for step in range(iters):
            for user in userItem:
                for userID, samples in user.items():
                    for itemID, rui in samples.items():
                        eui = rui - self.lfmPredict(p, q, userID, itemID)
                        for f in range(lclass):
                            # .loc replaces the removed chained-assignment
                            # pattern p[f][userID] += ... / q[itemID][f] += ...
                            p.loc[userID, f] += alpha * (eui * q.loc[f, itemID] - lamda * p.loc[userID, f])
                            q.loc[f, itemID] += alpha * (eui * p.loc[userID, f] - lamda * q.loc[f, itemID])
            alpha *= 0.9  # decay the learning rate each pass
        return p, q

    def recommend(self, userid, p, q):
        """Return a Series (itemid -> score) of the top-k predictions for *userid*."""
        itemID = self.itemID
        topk = self.topk
        predictList = [self.lfmPredict(p, q, userid, itemid) for itemid in itemID]
        series = pd.Series(predictList, index=itemID)
        return series.sort_values(ascending=False)[:topk]

    def recallAndPrecision(self, p, q):
        """Compute (recall, precision) of the top-k lists over the training users."""
        traindata = self.traindata
        hit = 0
        recall = 0
        precision = 0
        for userid in self.userID:
            # BUG FIX: `item in series` tests the Series INDEX (row labels),
            # not the item ids — membership must be against the values.
            trueItem = set(traindata[traindata['userid'] == userid]['itemid'].values)
            preItem = list(self.recommend(userid, p, q).index)
            hit += sum(1 for item in preItem if item in trueItem)
            recall += len(trueItem)
            precision += len(preItem)
        return (hit / (recall * 1.0), hit / (precision * 1.0))

    def coverage(self, p, q):
        """Fraction of all rated items that appear in at least one top-k list."""
        traindata = self.traindata
        recommend_items = set()
        all_items = set()
        for userid in self.userID:
            all_items.update(traindata[traindata['userid'] == userid]['itemid'])
            recommend_items.update(self.recommend(userid, p, q).index)
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, p, q):
        """Average log-popularity of recommended items (lower = more novel)."""
        itemLen = self.itemLen
        ret = 0
        n = 0
        for userid in self.userID:
            for item in self.recommend(userid, p, q).index:
                ret += math.log(1 + itemLen[item])
                n += 1
        return ret / (n * 1.0)
 
if __name__ == "__main__":
    # time.clock() was removed in Python 3.8; perf_counter is the replacement.
    start = time.perf_counter()

    # Load the ratings data (columns: userid, itemid, ratings).
    df_sample = pd.read_csv("D:\\tmp\\ratings.csv", names=['userid', 'itemid', 'ratings'], header=0)
    traindata = df_sample[['userid', 'itemid']]
    # Grid-search the two parameters with the largest quality impact:
    # the negative/positive sample ratio and the number of latent classes.
    for ratio in [1, 2, 3, 5, 10, 20]:
        for lclass in [5, 10, 20, 30, 50]:
            lfm = LFM(lclass, 2, 0.02, 0.01, 10, ratio, traindata)
            p, q = lfm.latenFactorModel()
            # Model evaluation ('lcalss' header typo fixed to 'lclass').
            print("%3s%20s%20s%20s%20s%20s" % ('ratio', 'lclass', "recall", 'precision', 'coverage', 'popularity'))
            recall, precision = lfm.recallAndPrecision(p, q)
            coverage = lfm.coverage(p, q)
            popularity = lfm.popularity(p, q)
            print("%3d%20d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (ratio, lclass, recall * 100, precision * 100, coverage * 100, popularity))

    end = time.perf_counter()
    print('finish all in %s' % str(end - start))

關注三點:
1)效能受正負樣例比率、隱類數量影響最大,要訓練出一個最佳引數。
2)對於梯度下降的收斂條件,即迭代次數,限定步長為0.02,迭代次數n要訓練出一個最佳值。
3)對於增量資料的訓練:儲存p、q矩陣,對於增量樣本集,可以在p、q基礎上訓練,有待實踐驗證,避免每次全量訓練耗費效能。