
Python Recommendation System Library: Surprise

@ 2018-01-24

Surprise

When building recommender models we will use the Python library Surprise (Simple Python RecommendatIon System Engine), one of the scikit family of packages (many of you will have used scikit-learn, scikit-image, and similar libraries). The Surprise User Guide gives detailed explanations and documentation.

It is simple and easy to use, and supports a variety of recommendation algorithms:

Algorithm class   | Description
NormalPredictor   | Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
BaselineOnly      | Algorithm predicting the baseline estimate for given user and item.
KNNBasic          | A basic collaborative filtering algorithm.
KNNWithMeans      | A basic collaborative filtering algorithm, taking into account the mean ratings of each user.
KNNBaseline       | A basic collaborative filtering algorithm taking into account a baseline rating.
SVD               | The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize.
SVDpp             | The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
NMF               | A collaborative filtering algorithm based on Non-negative Matrix Factorization.
SlopeOne          | A simple yet accurate collaborative filtering algorithm.
CoClustering      | A collaborative filtering algorithm based on co-clustering.
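
All of these classes are exposed at the top level of the package, so they can be imported directly, which is how the examples later in this post use them. For reference:

# the algorithm classes listed above all live in the top-level surprise namespace
from surprise import NormalPredictor, BaselineOnly
from surprise import KNNBasic, KNNWithMeans, KNNBaseline
from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering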

The neighborhood-based methods (collaborative filtering) can be configured with different similarity measures (a short configuration example follows the table below).

Similarity measure | Description
cosine             | Compute the cosine similarity between all pairs of users (or items).
msd                | Compute the Mean Squared Difference similarity between all pairs of users (or items).
pearson            | Compute the Pearson correlation coefficient between all pairs of users (or items).
pearson_baseline   | Compute the (shrunk) Pearson correlation coefficient between all pairs of users (or items) using baselines for centering instead of means.
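
The measure is chosen through the sim_options dictionary that the KNN-style algorithms accept; the same mechanism appears in the KNNBaseline example later in this post. A minimal sketch with KNNBasic:

from surprise import KNNBasic

# pick the similarity measure and whether similarities are computed
# between users (user_based=True) or between items (user_based=False)
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)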

It also supports different evaluation metrics:

Metric | Description
rmse   | Compute RMSE (Root Mean Squared Error).
mae    | Compute MAE (Mean Absolute Error).
fcp    | Compute FCP (Fraction of Concordant Pairs).

Usage examples

Basic usage is as follows:

# any of the recommendation algorithms listed above can be used here
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# load the built-in movielens dataset; Surprise will ask whether to download it
# the first time (MovieLens is one of the classic public recommendation datasets)
data = Dataset.load_builtin('ml-100k')
# k-fold cross-validation (k=3)
data.split(n_folds=3)
# try SVD matrix factorization
algo = SVD()
# evaluate it on the dataset
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
# print the results
print_perf(perf)

How to load your own dataset:

import os
from surprise import Dataset, Reader

# path to the ratings file
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# tell the reader what the text format looks like
reader = Reader(line_format='user item rating timestamp', sep='\t')
# load the data
data = Dataset.load_from_file(file_path, reader=reader)
# manually split into 5 folds (for cross-validation)
data.split(n_folds=5)
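
The Reader above assumes Surprise's default rating scale; if your ratings use a different range, the Reader constructor also takes a rating_scale argument. A small sketch, assuming a hypothetical file with ratings on a 0 to 10 scale:

# hypothetical example: ratings stored on a 0-10 scale
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 10))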

Tuning the algorithm (for better recommendations)

The algorithms implemented here are trained with SGD and similar optimizers, so there are hyperparameters that affect the final result. Just as with sklearn's familiar GridSearchCV, we can use grid search with cross-validation to select the best parameters. A simple example:

from surprise import SVD, Dataset, GridSearch

# define the parameter grid to search over
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
# run grid search with cross-validation
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
# find the best parameters on the dataset
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
grid_search.evaluate(data)

# print the tuning results
# best RMSE score
print(grid_search.best_score['RMSE'])
# >>> 0.96117566386

# parameters that gave the best RMSE
print(grid_search.best_params['RMSE'])
# >>> {'reg_all': 0.4, 'lr_all': 0.005, 'n_epochs': 10}

# best FCP score
print(grid_search.best_score['FCP'])
# >>> 0.702279736531

# parameters that gave the best FCP
print(grid_search.best_params['FCP'])
# >>> {'reg_all': 0.6, 'lr_all': 0.005, 'n_epochs': 10}
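
A natural follow-up (not shown in the original snippet) is to rebuild the model with the winning parameters and retrain it on the full training set:

# rebuild an SVD model with the parameters found by the grid search
best_params = grid_search.best_params['RMSE']
algo = SVD(n_epochs=best_params['n_epochs'],
           lr_all=best_params['lr_all'],
           reg_all=best_params['reg_all'])
# train on the whole dataset instead of the cross-validation folds
trainset = data.build_full_trainset()
algo.train(trainset)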

Training a model on your own dataset

First, load the data:

import os
from surprise import Reader, Dataset
# path to the data file
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# describe the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# split into 5 folds
music_data.split(n_folds=5)

Build and compare models with different recommendation algorithms:

### using NormalPredictor
from surprise import NormalPredictor, evaluate
algo = NormalPredictor()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using BaselineOnly
from surprise import BaselineOnly, evaluate
algo = BaselineOnly()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using basic collaborative filtering
from surprise import KNNBasic, evaluate
algo = KNNBasic()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using collaborative filtering with user mean ratings
from surprise import KNNWithMeans, evaluate
algo = KNNWithMeans()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using collaborative filtering with baselines
from surprise import KNNBaseline, evaluate
algo = KNNBaseline()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using SVD
from surprise import SVD, evaluate
algo = SVD()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using SVD++
from surprise import SVDpp, evaluate
algo = SVDpp()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### using NMF
from surprise import NMF, evaluate, print_perf
algo = NMF()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])
print_perf(perf)
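
Since every block above repeats the same three steps, the comparison can also be written as a single loop over the algorithm classes; a compact sketch of the same procedure:

from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNBaseline, SVD, SVDpp, NMF, evaluate, print_perf)

# evaluate each algorithm on the same data with the same metrics
for algo_class in [NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                   KNNBaseline, SVD, SVDpp, NMF]:
    print(algo_class.__name__)
    perf = evaluate(algo_class(), music_data, measures=['RMSE', 'MAE'])
    print_perf(perf)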

Building and saving models

1. Building a model with collaborative filtering and making predictions

1.1 A movielens example

# any of the recommendation algorithms listed above can be used here
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# load the built-in movielens dataset
data = Dataset.load_builtin('ml-100k')
# k-fold cross-validation (k=3)
data.split(n_folds=3)
# try SVD matrix factorization
algo = SVD()
# evaluate it on the dataset
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
# print the results
print_perf(perf)

"""
The following code shows how, after a collaborative filtering model has been trained, to retrieve the items most similar to a given item, mainly using the algo.get_neighbors() function.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset


def read_item_names():
    """
    Build the mappings from movie name to movie id and from movie id to movie name.
    """

    file_name = (os.path.expanduser('~') +
                 '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# first, train the algorithm to compute the pairwise similarities
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)

# get the mappings from movie name to movie id and from movie id to movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve the inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

1.2 A music prediction example

from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset, Reader

import pickle
# rebuild the mapping from playlist id to playlist name
id_name_dic = pickle.load(open("popular_playlist.pkl", "rb"))
print("Loaded the playlist id -> playlist name mapping...")
# rebuild the mapping from playlist name to playlist id
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("Loaded the playlist name -> playlist id mapping...")


file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# describe the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# compute the similarities between playlists
print("Building the dataset...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

# naming used below:
#   current_playlist => playlist name
#   playlist_id => playlist id (the raw id assigned by NetEase)
#   playlist_inner_id => inner id (playlist ids re-indexed consecutively by Surprise)
print("Training the model...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNBaseline()
algo.train(trainset)

# pick one playlist as the query
current_playlist = list(name_id_dic.keys())[39]
print(current_playlist)

# retrieve its nearest neighbors
playlist_id = name_id_dic[current_playlist]
print(playlist_id)
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print(playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# convert the playlist inner ids back to playlist names
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                       for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
                       for playlist_id in playlist_neighbors)

print()
print("和歌單 《", current_playlist, "》 最接近的10個歌單為:\n")
for playlist in playlist_neighbors:
    print(playlist)

2. Making predictions with SVD matrix factorization

### using SVD++
import os
from surprise import SVDpp, evaluate
from surprise import Dataset, Reader

file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# describe the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# build the full training set and fit the model
algo = SVDpp()
trainset = music_data.build_full_trainset()
algo.train(trainset)
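
The section heading above mentions saving models, and this part is about making predictions, yet the snippet stops after training. A rough sketch of both steps, using algo.predict() and the surprise.dump helpers (the playlist and song ids below are placeholders, not real ids from the data file):

from surprise import dump

# predict the rating a playlist (user) would give to a song (item);
# 'some_playlist_id' and 'some_song_id' are placeholder raw ids
pred = algo.predict('some_playlist_id', 'some_song_id')
print(pred.est)

# save the trained model to disk, then load it back later
dump.dump('./svdpp_model.dump', algo=algo)
_, loaded_algo = dump.load('./svdpp_model.dump')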