1. 程式人生 > >基於模型融合的推薦系統實現(1):基於SGD的PMF

基於模型融合的推薦系統實現(1):基於SGD的PMF

(1)PMF演算法

PMF的基本的思路,就是定義兩個基本的引數矩陣 W,U,然後對於任意一個組合 (u,m),利用內積 $W_u^{T} U_m$ 來獲取預測值。這些基本的演算法思路網上很多,就不細說了。簡單說一下程式

[0]:一開始我們要將訓練資料劃分為3部分,第一部分用來做普通的SGD訓練,第二部分用來訓練模型融合,第三部分用來測試RMSE。

[1]:我們利用SGD(隨機梯度下降)來訓練函式,最後就可以得到W,U,為了更好的效果,還添加了偏置引數bu,bi,也要訓練得到

[2]:初始值問題,我們隨機生成引數,並且正比於 $1/\sqrt{k}$(與程式中初始化時除以 $\sqrt{F}$ 一致)。

[3]:最後是學習速率的變化,我簡單的每次都乘以0.9。但是這裡有好幾種辦法:

method1: 我們可以用啟發式的演算法來更新學習速率.當RMSE變大的時候就要減少速率,反之可以增大。

method2: 讓學習速率等於 $\alpha = \dfrac{\alpha_0}{1 + d \cdot iter}$,d 是一個常數,用來調節減小的速度,隨著迭代次數的增加學習速率會越來越小。

[4]另外為了加快訓練的速度,避免每次訓練都要重頭開始,我將訓練的結果儲存在文本里面,每次重新讀取即可。所以這也增加了編寫程式的複雜度

[5]可以優化的地方:為了避免偏導數進入一個長期平滑的區域,我們可以引入動量(momentum)這個引數,每次偏導不光等於它的數學表示式,而是等於 $partial_d = momentum \cdot partial_d + expression$,這樣可以加快收斂。在這裡我就沒有實現了,但是實現也比較簡單。

下面是程式碼,分割檔案的程式碼在最後給出

# PMF (probabilistic matrix factorization) trained with SGD.
# Learns user factors p, item factors q and bias terms bu/bi, persists them
# to text files, and evaluates RMSE on a held-out split.
import numpy
from queue import PriorityQueue
# BUG FIX: `from collections import Iterable` was removed in Python 3.10;
# the abstract base classes live in collections.abc.
from collections.abc import Iterable
from collections import Counter, namedtuple, ChainMap, defaultdict
from functools import reduce
from itertools import groupby, chain, compress
from statistics import mean


def get_train(path=r'smaller_train.txt'):
    """Load (uid, iid, rating) triples into a nested dict train[uid][iid] = r."""
    from code import read_file  # project-local module code.py, imported lazily
    train = defaultdict(dict)
    for uid, iid, r in read_file(path):
        train[uid][iid] = r
    return train


def write_file(data, path):
    """Persist a model dict to *path*, one line per key.

    Vector values (factor lists) are written as space-separated 2-decimal
    floats; scalar values (biases) as a single 3-decimal float.
    """
    # A single 'w' open already truncates; the original truncate-then-append
    # double open was redundant.
    with open(path, 'w') as file:
        for u_i, modelitems in data.items():
            if isinstance(modelitems, Iterable):
                file.write('{0} '.format(u_i) + ' '.join('{0:.2f}'.format(x) for x in modelitems) + '\n')
            else:
                file.write('{0} '.format(u_i) + '{0:.3f}'.format(modelitems) + '\n')


def LFM(train, F, N, alpha, _lambda):
    """Train the factorization with SGD.

    train: nested dict train[u][i] = rating
    F: number of latent factors; N: number of epochs
    alpha: initial learning rate, decayed by 0.9 each epoch
    _lambda: L2 regularization strength
    Returns (bu, bi, p, q) and persists all four to '<name><F>.txt' files.
    """
    p, q, bu, bi = init(train, F)
    for step in range(N):
        print(step)
        print(bu[1], p[1][1])  # debug trace; assumes user 1 exists in train
        for u, user_items in train.items():
            pu = p[u]
            for i, r in user_items.items():
                pui = predict(u, i, p, q, bu, bi)
                eui = r - pui
                # BUG FIX: biases must accumulate the gradient step (+=);
                # the original overwrote them each sample, so they never
                # learned anything beyond the last update.
                bu[u] += alpha * (eui - _lambda * bu[u])
                bi[i] += alpha * (eui - _lambda * bi[i])
                qi = q[i]
                for f in range(F):
                    # BUG FIX: cache the pre-update pu[f] so the q gradient
                    # uses the value the error was computed with.
                    puf = pu[f]
                    pu[f] += alpha * (qi[f] * eui - _lambda * puf)
                    qi[f] += alpha * (puf * eui - _lambda * qi[f])
        alpha *= 0.9  # simple geometric learning-rate decay
    write_file(p, 'p{}.txt'.format(F))
    write_file(q, 'q{}.txt'.format(F))
    write_file(bu, 'bu{}.txt'.format(F))
    write_file(bi, 'bi{}.txt'.format(F))
    return bu, bi, p, q


def predict(u, i, p, q, bu, bi):
    """Predicted rating p_u . q_i + bu_u + bi_i, or None for an unseen u or i."""
    try:
        pu, qi, bu_, bi_ = p[u], q[i], bu[u], bi[i]
    except KeyError:  # narrow: only a missing user/item means "no prediction"
        return None
    return sum(pu[f] * qi[f] for f in range(len(pu))) + bu_ + bi_


def init(train, F):
    """Random factors proportional to 1/sqrt(F); all biases start at zero."""
    import random
    p, q = dict(), dict()
    bu, bi = dict(), dict()
    for u in train:
        p[u] = [random.random() / F ** 0.5 for _ in range(F)]
        bu[u] = 0
        for i in train[u]:
            if i not in q:
                bi[i] = 0
                q[i] = [random.random() / F ** 0.5 for _ in range(F)]
    return p, q, bu, bi


def get_pq(sep='\t', index=5):
    """Reload persisted p/q/bu/bi from p<index>.txt, q<index>.txt, etc.

    A line with exactly one value after the key is a bias (float); longer
    lines are factor vectors (list of float). `sep` is kept for interface
    compatibility; the regex split already handles every separator.
    """
    import re
    p, q, bu, bi = dict(), dict(), dict(), dict()
    names = ['p{0}.txt'.format(index), 'q{0}.txt'.format(index),
             'bu{0}.txt'.format(index), 'bi{0}.txt'.format(index)]
    for model, _name in zip([p, q, bu, bi], names):
        with open(_name) as f:
            for line in f:
                fields = [x for x in re.split(r'[;,\s\t\n]\s*', line) if x != '']
                uid = int(fields[0])
                model[uid] = (list(map(float, fields[1:]))
                              if len(fields) != 2 else float(fields[1]))
    return p, q, bu, bi


def REMS(p, q, bu, bi):
    """Print a running RMSE over the test file; predictions clamped to [1, 5]."""
    from code import read_file  # project-local module code.py
    error = 0
    cnt = 0
    for uid, iid, r in read_file(r'smaller_test.txt'):
        pr = predict(uid, iid, p, q, bu, bi)
        if pr is None:  # unseen user/item: skip, as in the original
            continue
        pr = min(5.0, max(1.0, pr))  # clamp to the valid rating range
        error += (r - pr) ** 2
        cnt += 1
        print(cnt, error, (error / cnt) ** 0.5)


if __name__ == '__main__':
    k = 50
    LFM(get_train(), k, 50, 0.02, 0.01)
    p, q, bu, bi = get_pq(index=k)
    REMS(p, q, bu, bi)

用來分隔檔案,讀取檔案

import numpy
from queue import PriorityQueue
from collections import Iterable,Counter,namedtuple,ChainMap,defaultdict
from functools import reduce
from itertools import groupby,chain,compress
from statistics import mean
import re

def read_file(r_path, sep='\t', num=3):
    """Yield parsed rating records from the file at *r_path*.

    Each line is split on any of ';', ',' or whitespace. Yields
    (uid, iid) pairs when num == 2 (test files without ratings),
    otherwise (uid, iid, rating) triples.
    """
    splitter = r'[;,\s\t\n]\s*'
    with open(r_path) as handle:
        for raw in handle:
            fields = [tok for tok in re.split(splitter, raw) if tok != '']
            uid, iid = int(fields[0]), int(fields[1])
            if num == 2:
                # test file: only uid and item id are present
                yield uid, iid
            else:
                # train file: uid, item id and a float rating
                yield uid, iid, float(fields[2])

def write_file(w_path, data):
    """Write a nested rating dict data[uid][iid] = r as 'uid<TAB>iid<TAB>r' lines.

    Opens *w_path* once in 'w' mode, which already truncates any existing
    file — the original truncate-then-reopen-for-append double open was
    redundant.
    """
    with open(w_path, 'w') as file:
        for u, user in data.items():
            for iid, r in user.items():
                file.write('{0}\t{1}\t{2}\n'.format(u, iid, r))
def split(src_path=r'real_train.txt', train_path=r'smaller_train.txt',
          test_path=r'smaller_test.txt', M=10):
    """Split the ratings in *src_path* into train/test files.

    Every M-th record goes to *test_path*, the rest to *train_path*
    (so a 9:1 split by default). The defaulted parameters reproduce the
    original hard-coded behavior exactly, while allowing other files or
    split ratios. The dead w_path1/w_path2 variables and the
    commented-out writes to them were removed.
    """
    train = defaultdict(dict)
    test = defaultdict(dict)
    # start=1 matches the original pre-increment: record 10, 20, ... go to test
    for index, (uid, iid, r) in enumerate(read_file(src_path), start=1):
        bucket = test if index % M == 0 else train
        bucket[uid][iid] = r
    write_file(train_path, train)
    write_file(test_path, test)
if __name__ == '__main__':
    # Entry point intentionally does nothing: call split() by hand when the
    # train/test files need to be regenerated.
    pass

大概7M的資料RMSE在0.88左右,應該還有進一步優化的空間