基於模型融合的推薦系統實現(1):基於SGD的PMF
阿新 • • 發佈:2018-10-31
(1)PMF演算法
PMF的基本的思路,就是定義兩個基本的引數W,U,然後對於任意一個組合(u,m),利用
W_i · U_j(即使用者隱向量與物品隱向量的內積 $W_i^\top U_j$)
,來獲取預測值。這些基本的演算法思路網上很多,就不細說了。簡單說一下程式
[0]:一開始我們要將訓練資料劃分為3部分,第一部分用來做普通的SGD訓練,第二部分用來訓練模型融合,第三部分用來測試RMSE。
[1]:我們利用SGD(隨機梯度下降)來訓練函式,最後就可以得到W,U,為了更好的效果,還添加了偏置引數bu,bi,也要訓練得到
[2]:初始值問題,我們隨機生成引數,並且正比於
1/√k(k 為隱因子個數)
.
[3]:最後是學習速率的變化,我簡單的每次都乘以0.9。但是這裡有好幾種辦法:
method1:
我們可以用啟發式的演算法來更新學習速率.當RMSE變大的時候就要減少速率,反之可以增大。
method2:
讓學習速率等於
α₀ / (1 + iter · d)
,d是一個常數用來條件減小的速度,隨著迭代的增加學習速率會越來越小。
[4]另外為了加快訓練的速度,避免每次訓練都要重頭開始,我將訓練的結果儲存在文本里面,每次重新讀取即可。所以這也增加了編寫程式的複雜度
[5]可以優化的地方:為了避免偏導數進入一個長期平滑的區域,我們可以引入動量這個引數,叫做momentum,每次偏導不光等於它的數學表示式,而是等於
partial_d = partial_d · momentum + expression(原偏導數表示式)
,這樣可以加快收斂,在這裡我就沒有實現了。但是實現也比較簡單。
下面是程式碼,分割檔案的程式碼在最後給出
import numpy
from queue import PriorityQueue
from collections import Iterable,Counter,namedtuple,ChainMap,defaultdict
from functools import reduce
from itertools import groupby,chain,compress
from statistics import mean
from code import read_file
def get_train(path=r'smaller_train.txt'):
    """Load the training ratings from *path*.

    Returns a nested mapping {user_id: {item_id: rating}} built from the
    (uid, iid, rating) triples yielded by read_file.
    """
    ratings = defaultdict(dict)
    for user_id, item_id, rating in read_file(path):
        ratings[user_id][item_id] = rating
    return ratings
def write_file(data,path):
    """Persist one model parameter table to *path*, one entry per line.

    data maps an id to either a factor vector (iterable -> values written
    with 2 decimals) or a scalar bias (written with 3 decimals).

    Fix: the original opened the file twice (once with 'w' just to
    truncate, then again with 'a'); a single 'w' open does both.
    """
    with open(path,'w') as file:
        for u_i,modelitems in data.items():
            if isinstance(modelitems,Iterable):
                file.write('{0} '.format(u_i)+' '.join(('{0:.2f}'.format(x) for x in modelitems))+'\n')
            else:
                file.write('{0} '.format(u_i)+'{0:.3f}'.format(modelitems)+'\n')
def LFM(train,F,N,alpha,_lambda):
    """Train a biased latent-factor model with SGD.

    train: {user: {item: rating}}; F: number of latent factors;
    N: number of epochs; alpha: initial learning rate (decayed by 0.9
    per epoch); _lambda: L2 regularization coefficient.
    Returns (bu, bi, p, q) and also saves each table to '<name><F>.txt'.

    Fixes vs. original:
    - bias updates used '=' instead of '+=', so each bias was overwritten
      by its last gradient step instead of accumulating;
    - the q-factor gradient used the already-updated p-factor; the
      pre-update value is now kept for a correct simultaneous update.
    """
    (p,q,bu,bi) = init(train,F)
    for step in range(0,N):
        # Progress/debug output (assumes user 1 exists in the training set).
        print(step)
        print(bu[1],p[1][1])
        for u,user_items in train.items():
            pu = p[u]
            for i,r in user_items.items():
                eui = r - predict(u,i,p,q,bu,bi)  # prediction error
                bu[u] += alpha*(eui-_lambda*bu[u])
                bi[i] += alpha*(eui-_lambda*bi[i])
                qi = q[i]
                for f in range(F):
                    puf = pu[f]  # pre-update value, used for the q gradient
                    pu[f] += alpha*(qi[f]*eui - _lambda*puf)
                    qi[f] += alpha*(puf*eui - _lambda*qi[f])
        alpha *= 0.9  # simple exponential learning-rate decay
    write_file(p,'p{}.txt'.format(F))
    write_file(q,'q{}.txt'.format(F))
    write_file(bu,'bu{}.txt'.format(F))
    write_file(bi,'bi{}.txt'.format(F))
    return bu,bi,p,q
def predict(u,i,p,q,bu,bi):
    """Predicted rating for (u, i): dot(p[u], q[i]) + bu[u] + bi[i].

    Returns None when any table lacks u or i (cold-start pair).
    Fix: the bare 'except:' swallowed every error (including typos and
    KeyboardInterrupt); only a missing key means "cannot predict".
    """
    try:
        pu,qi,bu_,bi_= p[u],q[i],bu[u],bi[i]
    except KeyError:
        return None
    return sum(a*b for a,b in zip(pu,qi))+bu_+bi_
def init(train,F):
    """Initialise model parameters for LFM training.

    Factor vectors are uniform random in [0, 1/sqrt(F)); both bias
    tables start at zero.  Returns (p, q, bu, bi).
    """
    import random
    scale = F ** 0.5
    p, bu = dict(), dict()
    q, bi = dict(), dict()
    for u, items in train.items():
        bu[u] = 0
        p[u] = [random.random() / scale for _ in range(F)]
        for i in items:
            if i not in q:
                bi[i] = 0
                q[i] = [random.random() / scale for _ in range(F)]
    return p,q,bu,bi
def get_pq(sep = '\t',index = 5):
    """Reload the four parameter tables saved by LFM for factor count *index*.

    Reads p<index>.txt, q<index>.txt, bu<index>.txt and bi<index>.txt.
    A line with a single value is parsed as a float scalar (bias tables);
    longer lines become lists of floats (factor vectors).
    Returns (p, q, bu, bi).

    Notes: *sep* is kept for interface compatibility but unused -- the
    split regex already accepts tab/space/comma/semicolon.  The original
    comprehension variable shadowed the loop target dict; both are now
    distinctly named, and the regex is compiled once instead of per line.
    """
    import re
    splitter = re.compile(r'[;,\s\t\n]\s*')
    p,q,bu,bi = dict(),dict(),dict(),dict()
    names = ['p{0}.txt'.format(index),'q{0}.txt'.format(index),
             'bu{0}.txt'.format(index),'bi{0}.txt'.format(index)]
    for table,_name in zip([p,q,bu,bi],names):
        with open(_name) as fh:
            for raw in fh:
                fields = [tok for tok in splitter.split(raw) if tok!='']
                uid = int(fields[0])
                table[uid] = list(map(float,fields[1:])) if len(fields)!=2 else float(fields[1])
    return p,q,bu,bi
def REMS(p,q,bu,bi):
    """Print (count, squared-error sum, RMSE) over smaller_test.txt.

    Predictions are clamped to the valid rating range [1, 5]; pairs the
    model cannot score (cold start -> predict returns None) are skipped.

    Fixes vs. original: removed unused mmin/mmax, 'pr == None' replaced
    with 'pr is None', and a ZeroDivisionError guard for the case where
    no test pair could be scored.
    """
    error = 0.0
    cnt = 0
    for uid,iid,r in read_file(r'smaller_test.txt'):
        pr = predict(uid,iid,p,q,bu,bi)
        if pr is None:
            continue
        pr = min(5.0, max(1.0, pr))  # clamp to the rating scale
        error += (r-pr)**2
        cnt += 1
    if cnt == 0:
        print(0, 0.0, float('nan'))
    else:
        print(cnt,error,(error/cnt)**0.5)
if __name__ == '__main__':
    # Train a 50-factor model for 50 epochs (alpha=0.02, lambda=0.01),
    # then reload the saved parameters and report test RMSE.
    k = 50
    LFM(get_train(), k, 50, 0.02, 0.01)
    REMS(*get_pq(index=k))
用來分隔檔案,讀取檔案
import numpy
from queue import PriorityQueue
from collections import Iterable,Counter,namedtuple,ChainMap,defaultdict
from functools import reduce
from itertools import groupby,chain,compress
from statistics import mean
import re
def read_file(r_path,sep='\t',num = 3):
    """Yield rating tuples parsed from the file at *r_path*.

    Each line holds fields separated by tab/space/comma/semicolon.
    With num == 2 yields (uid, iid) pairs (test files without ratings);
    otherwise yields (uid, iid, rating) with the rating as a float.
    sep is unused; the split regex already covers the separators.
    """
    pattern = re.compile(r'[;,\s\t\n]\s*')
    with open(r_path) as fh:
        for raw in fh:
            fields = [tok for tok in pattern.split(raw) if tok != '']
            uid, iid = int(fields[0]), int(fields[1])
            if num == 2:
                yield uid, iid
            else:
                yield uid, iid, float(fields[2])
def write_file(w_path,data):
    """Write nested ratings {uid: {iid: rating}} to *w_path*.

    One line per rating, tab-separated: uid<TAB>iid<TAB>rating.
    Fix: the original opened the file twice (once with 'w' just to
    truncate, then with 'a'); a single 'w' open does both.
    """
    with open(w_path,'w') as file:
        for u, user in data.items():
            for iid,r in user.items():
                file.write('{0}\t{1}\t{2}\n'.format(u,iid,r))
def split(M=10):
    """Split real_train.txt into train/test parts.

    Every M-th rating (default 10, i.e. ~10% of the data) goes to
    smaller_test.txt; the rest to smaller_train.txt.  The original
    hard-coded M; it is now a backward-compatible parameter.  Unused
    path variables and commented-out code were removed.
    """
    train = defaultdict(dict)
    test = defaultdict(dict)
    for index, (uid, iid, r) in enumerate(read_file(r'real_train.txt'), start=1):
        if index % M == 0:
            test[uid][iid] = r
        else:
            train[uid][iid] = r
    write_file(r'smaller_train.txt', train)
    write_file(r'smaller_test.txt', test)
if __name__ == '__main__':
    # No default action; call split() manually to (re)generate the
    # smaller_train.txt / smaller_test.txt data files.
    pass