1. 程式人生 > >電影推薦演算法例項程式碼

電影推薦演算法例項程式碼

# -*- coding=utf-8 -*-

import math
import sys
from texttable import Texttable

#   使用 |A&B|/sqrt(|A || B |)計算餘弦距離

def calcCosDistSpe(user1,user2):
    avg_x=0.0
    avg_y=0.0
    for key in user1:
        avg_x+=key[1]
    avg_x=avg_x/len(user1)

    for key in user2:
        avg_y+=key[1]
    avg_y=avg_y/len(user2)

    u1_u2=0.0
for key1 in user1: for key2 in user2: if key1[1] > avg_x and key2[1]>avg_y and key1[0]==key2[0]: u1_u2+=1 u1u2=len(user1)*len(user2)*1.0 sx_sy=u1_u2/math.sqrt(u1u2) return sx_sy # # 計算餘弦距離 def calcCosDist(user1,user2): sum_x=0.0 sum_y=0.0
sum_xy=0.0 for key1 in user1: for key2 in user2: if key1[0]==key2[0] : sum_xy+=key1[1]*key2[1] sum_y+=key2[1]*key2[1] sum_x+=key1[1]*key1[1] if sum_xy == 0.0 : return 0 sx_sy=math.sqrt(sum_x*sum_y) return sum_xy/sx_sy #
# 相似餘弦距離 def calcSimlaryCosDist(user1,user2): sum_x=0.0 sum_y=0.0 sum_xy=0.0 avg_x=0.0 avg_y=0.0 for key in user1: avg_x+=key[1] avg_x=avg_x/len(user1) for key in user2: avg_y+=key[1] avg_y=avg_y/len(user2) for key1 in user1: for key2 in user2: if key1[0]==key2[0] : sum_xy+=(key1[1]-avg_x)*(key2[1]-avg_y) sum_y+=(key2[1]-avg_y)*(key2[1]-avg_y) sum_x+=(key1[1]-avg_x)*(key1[1]-avg_x) if sum_xy == 0.0 : return 0 sx_sy=math.sqrt(sum_x*sum_y) return sum_xy/sx_sy # # 讀取檔案 def readFile(file_name): contents_lines=[] f=open(file_name,"r") contents_lines=f.readlines() f.close() return contents_lines # # 解壓rating資訊,格式:使用者id\t硬碟id\t使用者rating\t時間 # 輸入:資料集合 # 輸出:已經解壓的排名資訊 # def getRatingInformation(ratings): rates=[] for line in ratings: rate=line.split("\t") rates.append([int(rate[0]),int(rate[1]),int(rate[2])]) return rates # # 生成使用者評分的資料結構 # # 輸入:所以資料 [[2,1,5],[2,4,2]...] # 輸出:1.使用者打分字典 2.電影字典 # 使用字典,key是使用者id,value是使用者對電影的評價, # rate_dic[2]=[(1,5),(4,2)].... 表示使用者2對電影1的評分是5,對電影4的評分是2 def createUserRankDic(rates): user_rate_dic={} item_to_user={} for i in rates: user_rank=(i[1],i[2]) if i[0] in user_rate_dic: user_rate_dic[i[0]].append(user_rank) else: user_rate_dic[i[0]]=[user_rank] if i[1] in item_to_user: item_to_user[i[1]].append(i[0]) else: item_to_user[i[1]]=[i[0]] return user_rate_dic,item_to_user # # 計算與指定使用者最相近的鄰居 # 輸入:指定使用者ID,所以使用者資料,所以物品資料 # 輸出:與指定使用者最相鄰的鄰居列表 # def calcNearestNeighbor(userid,users_dic,item_dic): neighbors=[] #neighbors.append(userid) for item in users_dic[userid]: for neighbor in item_dic[item[0]]: if neighbor != userid and neighbor not in neighbors: neighbors.append(neighbor) neighbors_dist=[] for neighbor in neighbors: dist=calcSimlaryCosDist(users_dic[userid],users_dic[neighbor]) #calcSimlaryCosDist calcCosDist calcCosDistSpe neighbors_dist.append([dist,neighbor]) neighbors_dist.sort(reverse=True) #print neighbors_dist return neighbors_dist # # 使用UserFC進行推薦 # 輸入:檔名,使用者ID,鄰居數量 # 輸出:推薦的電影ID,輸入使用者的電影列表,電影對應使用者的反序表,鄰居列表 # def recommendByUserFC(file_name,userid,k=5): #讀取檔案資料 test_contents=readFile(file_name) #檔案資料格式化成二維陣列 List[[使用者id,電影id,電影評分]...] test_rates=getRatingInformation(test_contents) #格式化成字典資料 # 1.使用者字典:dic[使用者id]=[(電影id,電影評分)...] # 2.電影字典:dic[電影id]=[使用者id1,使用者id2...] test_dic,test_item_to_user=createUserRankDic(test_rates) #尋找鄰居 neighbors=calcNearestNeighbor(userid,test_dic,test_item_to_user)[:k] recommend_dic={} for neighbor in neighbors: neighbor_user_id=neighbor[1] movies=test_dic[neighbor_user_id] for movie in movies: #print movie if movie[0] not in recommend_dic: recommend_dic[movie[0]]=neighbor[0] else: recommend_dic[movie[0]]+=neighbor[0] #print len(recommend_dic) #建立推薦列表 recommend_list=[] for key in recommend_dic: #print key recommend_list.append([recommend_dic[key],key]) recommend_list.sort(reverse=True) #print recommend_list user_movies = [ i[0] for i in test_dic[userid]] return [i[1] for i in recommend_list],user_movies,test_item_to_user,neighbors # # 獲取電影的列表 def getMoviesList(file_name): #print sys.getdefaultencoding() movies_contents=readFile(file_name) movies_info={} for movie in movies_contents: movie_info=movie.split("|") movies_info[int(movie_info[0])]=movie_info[1:] return movies_info # #主程式 #輸入 : 測試資料集合 if __name__ == '__main__': reload(sys) sys.setdefaultencoding('utf-8') movies=getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item") recommend_list,user_movie,items_movie,neighbors=recommendByUserFC("/Users/wuyinghao/Downloads/ml-100k/u.data",179,80) neighbors_id=[ i[1] for i in neighbors] table = Texttable() table.set_deco(Texttable.HEADER) table.set_cols_dtype(['t', # text 't', # float (decimal) 't']) # automatic table.set_cols_align(["l", "l", "l"]) rows=[] rows.append([u"movie name",u"release", u"from userid"]) for movie_id in recommend_list[:20]: from_user=[] for user_id in items_movie[movie_id]: if user_id in neighbors_id: from_user.append(user_id) rows.append([movies[movie_id][0],movies[movie_id][1],""]) table.add_rows(rows) print table.draw()
結果
movie name                release     
=======================================================
Contact (1997)                11-Jul-1997               
Scream (1996)                 20-Dec-1996               
Liar Liar (1997)              21-Mar-1997               
Saint, The (1997)             14-Mar-1997               
English Patient, The (1996)   15-Nov-1996               
Titanic (1997)                01-Jan-1997               
Air Force One (1997)          01-Jan-1997               
Star Wars (1977)              01-Jan-1977               
Conspiracy Theory (1997)      08-Aug-1997               
Toy Story (1995)              01-Jan-1995               
Fargo (1996)                  14-Feb-1997