基於卷積神經網路電視節目推薦
阿新 • • 發佈:2018-12-21
本文使用文字卷積神經網路,並使用自己的電視節目資料集完成電影推薦的任務。
需要安裝TensorFlow1.0,Python3.5
文字卷積神經網路的圖如下,
圖片來自Kim Yoon的論文:Convolutional Neural Networks for Sentence Classification
# -*- coding: utf-8 -*- import pandas as pd from sklearn.model_selection import train_test_split import numpy as np from collections import Counter import tensorflow as tf import os import pickle from tensorflow.python.ops import math_ops from urllib.request import urlretrieve from os.path import isfile, isdir from tqdm import tqdm def load_data(): """ Load Dataset from File """ os.chdir('E:/廣電大資料營銷推薦專案案例/資料清洗/電視節目資訊資料預處理') # # #讀取User資料 users = pd.read_table('./wordsbag/dataprocess/data/week/mydata/data1_users.csv', sep=',', header='infer', engine = 'python') users_orig = users.values #讀取Movie資料集 # movies_title = ['MovieID', 'Title', 'Genres'] movies = pd.read_table('./wordsbag/dataprocess/data/week/mydata/data1_tv.csv', sep=',', header='infer', engine = 'python') movies = movies.filter(regex='program_id|program_title|genres_good') movies_orig = movies.values #電影型別轉數字字典 genres_set = set() for val in movies['genres_good'].str.split('/'): genres_set.update(val) genres_set.add('<PAD>') genres2int = {val:ii for ii, val in enumerate(genres_set)} #將電影型別轉成等長數字列表,長度是18 genres_count = 18 genres_map = {val:[genres2int[row] for row in val.split('/')] for ii,val in enumerate(set(movies['genres_good']))} for key in genres_map: for cnt in range(genres_count - len(genres_map[key])): genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>']) movies['genres_good'] = movies['genres_good'].map(genres_map) #電影Title轉數字字典 title_set = set() for val in movies['program_title'].str.split(): title_set.update(val) title_set.add('<PAD>') title2int = {val:ii for ii, val in enumerate(title_set)} #將電影Title轉成等長數字列表,長度是15 title_count = 15 title_map = {val:[title2int[row] for row in val.split()] for ii,val in enumerate(set(movies['program_title']))} for key in title_map: for cnt in range(title_count - len(title_map[key])): title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>']) movies['program_title'] = movies['program_title'].map(title_map) #讀取評分資料集,評分為1-10分 ratings = pd.read_table('./wordsbag/dataprocess/data/week/mydata/data1_ratings.csv', sep=',', header='infer', engine = 'python') ratings = ratings.filter(regex='user_id|program_id|rating') #合併表 data = pd.merge(ratings,movies) #將資料分成X和y兩張表 target_fields = ['rating'] features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields] features = features_pd.values targets_values = targets_pd.values return title_count, title_set, genres_count, genres2int, features, targets_values, ratings, users, movies, data, movies_orig,users_orig title_count, title_set, genres_count, genres2int, features, targets_values, ratings, users, movies, data, movies_orig,users_orig= load_data() pickle.dump((title_count, title_set, genres_count, genres2int, features, targets_values, ratings, movies, data, movies_orig,users_orig), open('./wordsbag/dataprocess/data/week/mydata/preprocess.p', 'wb')) # 預處理後 #users.head() movies.head() movies.values[0] title_count, title_set, genres_count, genres2int, features, targets_values, ratings, movies, data, movies_orig,users_orig = pickle.load(open('./wordsbag/dataprocess/data/week/mydata/preprocess.p', mode='rb')) import tensorflow as tf import os import pickle def save_params(params): """ Save parameters to file """ pickle.dump(params, open('./wordsbag/dataprocess/data/week/mydata/params.p', 'wb')) def load_params(): """ Load parameters from file """ return pickle.load(open('./wordsbag/dataprocess/data/week/mydata/params.p', mode='rb')) # 編碼實現 #嵌入矩陣的維度 embed_dim = 32 #使用者ID個數 uid_max = max(features.take(0,1)) + 1 # 1966+1=1967 print(uid_max) #電影ID個數 movie_id_max = max(features.take(1,1)) + 1 # 995+1 = 996 print(movie_id_max) #電影型別個數 movie_categories_max = max(genres2int.values()) + 1 # 104為什麼有這麼多重複的個數 print(movie_categories_max) #電影名單詞個數 movie_title_max = len(title_set) # 501+1=502 print(movie_title_max) #對電影型別嵌入向量做加和操作的標誌,考慮過使用mean做平均,但是沒實現mean combiner = "sum" #電影名長度 sentences_size = title_count # = 15 #文字卷積滑動視窗,分別滑動2, 3, 4, 5個單詞 window_sizes = {2, 3, 4, 5} #文字卷積核數量 filter_num = 8 #電影ID轉下標的字典,資料集中電影ID跟下標不一致,比如第5行的資料電影ID不一定是5 movieid2idx = {val[0]:i for i, val in enumerate(movies.values)} print(movieid2idx) # 超參 # Number of Epochs num_epochs = 5 # Batch Size batch_size = 256 dropout_keep = 0.5 # Learning Rate learning_rate = 0.0001 # Show stats for every n number of batches show_every_n_batches = 20 save_dir = './wordsbag/dataprocess/data/week/mydata/save2' # 輸入 def get_inputs(): uid = tf.placeholder(tf.int32, [None, 1], name="uid") movie_id = tf.placeholder(tf.int32, [None, 1], name="movie_id") movie_categories = tf.placeholder(tf.int32, [None, 18], name="movie_categories") movie_titles = tf.placeholder(tf.int32, [None, 15], name="movie_titles") targets = tf.placeholder(tf.int32, [None, 1], name="targets") LearningRate = tf.placeholder(tf.float32, name = "LearningRate") dropout_keep_prob = tf.placeholder(tf.float32, name = "dropout_keep_prob") return uid, movie_id, movie_categories, movie_titles, targets, LearningRate, dropout_keep_prob # 構建神經網路 def get_user_embedding(uid): with tf.name_scope("user_embedding"): uid_embed_matrix = tf.Variable(tf.random_uniform([uid_max, embed_dim], -1, 1), name = "uid_embed_matrix") uid_embed_layer = tf.nn.embedding_lookup(uid_embed_matrix, uid, name = "uid_embed_layer") return uid_embed_layer #將User的嵌入矩陣一起全連線生成User的特徵 def get_user_feature_layer(uid_embed_layer): with tf.name_scope("user_fc"): #第一層全連線 uid_fc_layer = tf.layers.dense(uid_embed_layer, embed_dim, name = "uid_fc_layer", activation=tf.nn.relu) #第二層全連線 user_combine_layer = tf.concat([uid_fc_layer], 2) #(?, 1, 128) user_combine_layer = tf.contrib.layers.fully_connected(user_combine_layer, 200, tf.tanh) #(?, 1, 200) user_combine_layer_flat = tf.reshape(user_combine_layer, [-1, 200]) return user_combine_layer, user_combine_layer_flat #定義Movie ID的嵌入矩陣 def get_movie_id_embed_layer(movie_id): with tf.name_scope("movie_embedding"): movie_id_embed_matrix = tf.Variable(tf.random_uniform([movie_id_max, embed_dim], -1, 1), name = "movie_id_embed_matrix") movie_id_embed_layer = tf.nn.embedding_lookup(movie_id_embed_matrix, movie_id, name = "movie_id_embed_layer") return movie_id_embed_layer #對電影型別的多個嵌入向量做加和 def get_movie_categories_layers(movie_categories): with tf.name_scope("movie_categories_layers"): movie_categories_embed_matrix = tf.Variable(tf.random_uniform([movie_categories_max, embed_dim], -1, 1), name = "movie_categories_embed_matrix") movie_categories_embed_layer = tf.nn.embedding_lookup(movie_categories_embed_matrix, movie_categories, name = "movie_categories_embed_layer") if combiner == "sum": movie_categories_embed_layer = tf.reduce_sum(movie_categories_embed_layer, axis=1, keep_dims=True) # elif combiner == "mean": return movie_categories_embed_layer # Movie Title的文字卷積網路實現 def get_movie_cnn_layer(movie_titles): #從嵌入矩陣中得到電影名對應的各個單詞的嵌入向量 with tf.name_scope("movie_embedding"): movie_title_embed_matrix = tf.Variable(tf.random_uniform([movie_title_max, embed_dim], -1, 1), name = "movie_title_embed_matrix") movie_title_embed_layer = tf.nn.embedding_lookup(movie_title_embed_matrix, movie_titles, name = "movie_title_embed_layer") movie_title_embed_layer_expand = tf.expand_dims(movie_title_embed_layer, -1) #對文字嵌入層使用不同尺寸的卷積核做卷積和最大池化 pool_layer_lst = [] for window_size in window_sizes: with tf.name_scope("movie_txt_conv_maxpool_{}".format(window_size)): filter_weights = tf.Variable(tf.truncated_normal([window_size, embed_dim, 1, filter_num],stddev=0.1),name = "filter_weights") filter_bias = tf.Variable(tf.constant(0.1, shape=[filter_num]), name="filter_bias") conv_layer = tf.nn.conv2d(movie_title_embed_layer_expand, filter_weights, [1,1,1,1], padding="VALID", name="conv_layer") relu_layer = tf.nn.relu(tf.nn.bias_add(conv_layer,filter_bias), name ="relu_layer") maxpool_layer = tf.nn.max_pool(relu_layer, [1,sentences_size - window_size + 1 ,1,1], [1,1,1,1], padding="VALID", name="maxpool_layer") pool_layer_lst.append(maxpool_layer) #Dropout層 with tf.name_scope("pool_dropout"): pool_layer = tf.concat(pool_layer_lst, 3, name ="pool_layer") max_num = len(window_sizes) * filter_num pool_layer_flat = tf.reshape(pool_layer , [-1, 1, max_num], name = "pool_layer_flat") dropout_layer = tf.nn.dropout(pool_layer_flat, dropout_keep_prob, name = "dropout_layer") return pool_layer_flat, dropout_layer # 將Movie的各個層一起做全連線 def get_movie_feature_layer(movie_id_embed_layer, movie_categories_embed_layer, dropout_layer): with tf.name_scope("movie_fc"): #第一層全連線 movie_id_fc_layer = tf.layers.dense(movie_id_embed_layer, embed_dim, name = "movie_id_fc_layer", activation=tf.nn.relu) movie_categories_fc_layer = tf.layers.dense(movie_categories_embed_layer, embed_dim, name = "movie_categories_fc_layer", activation=tf.nn.relu) #第二層全連線 movie_combine_layer = tf.concat([movie_id_fc_layer, movie_categories_fc_layer, dropout_layer], 2) #(?, 1, 96) movie_combine_layer = tf.contrib.layers.fully_connected(movie_combine_layer, 200, tf.tanh) #(?, 1, 200) movie_combine_layer_flat = tf.reshape(movie_combine_layer, [-1, 200]) return movie_combine_layer, movie_combine_layer_flat #構建計算圖 tf.reset_default_graph() train_graph = tf.Graph() with train_graph.as_default(): #獲取輸入佔位符 uid, movie_id, movie_categories, movie_titles, targets, lr, dropout_keep_prob = get_inputs() #獲取User的4個嵌入向量 uid_embed_layer = get_user_embedding(uid) #得到使用者特徵 user_combine_layer, user_combine_layer_flat = get_user_feature_layer(uid_embed_layer) #獲取電影ID的嵌入向量 movie_id_embed_layer = get_movie_id_embed_layer(movie_id) #獲取電影型別的嵌入向量 movie_categories_embed_layer = get_movie_categories_layers(movie_categories) #獲取電影名的特徵向量 pool_layer_flat, dropout_layer = get_movie_cnn_layer(movie_titles) #得到電影特徵 movie_combine_layer, movie_combine_layer_flat = get_movie_feature_layer(movie_id_embed_layer, movie_categories_embed_layer, dropout_layer) #計算出評分,要注意兩個不同的方案,inference的名字(name值)是不一樣的,後面做推薦時要根據name取得tensor with tf.name_scope("inference"): #將使用者特徵和電影特徵作為輸入,經過全連線,輸出一個值的方案 #簡單的將使用者特徵和電影特徵做矩陣乘法得到一個預測評分 inference = tf.reduce_sum(user_combine_layer_flat * movie_combine_layer_flat, axis=1) inference = tf.expand_dims(inference, axis=1) with tf.name_scope("loss"): # MSE損失,將計算值迴歸到評分 cost = tf.losses.mean_squared_error(targets, inference ) loss = tf.reduce_mean(cost) # 優化損失 # train_op = tf.train.AdamOptimizer(lr).minimize(loss) #cost global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(lr) gradients = optimizer.compute_gradients(loss) #cost train_op = optimizer.apply_gradients(gradients, global_step=global_step) # 取得batch def get_batches(Xs, ys, batch_size): for start in range(0, len(Xs), batch_size): end = min(start + batch_size, len(Xs)) yield Xs[start:end], ys[start:end] # 訓練網路 #%matplotlib inline #%config InlineBackend.figure_format = 'retina' import matplotlib.pyplot as plt import time import datetime losses = {'train':[], 'test':[]} with tf.Session(graph=train_graph) as sess: #蒐集資料給tensorBoard用 # Keep track of gradient values and sparsity grad_summaries = [] for g, v in gradients: if g is not None: grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name.replace(':', '_')), g) sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name.replace(':', '_')), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.summary.scalar("loss", loss) # Train Summaries train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) # Inference summaries inference_summary_op = tf.summary.merge([loss_summary]) inference_summary_dir = os.path.join(out_dir, "summaries", "inference") inference_summary_writer = tf.summary.FileWriter(inference_summary_dir, sess.graph) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() for epoch_i in range(num_epochs): #將資料集分成訓練集和測試集,隨機種子不固定 train_X,test_X, train_y, test_y = train_test_split(features, targets_values, test_size = 0.2, random_state = 0) train_batches = get_batches(train_X, train_y, batch_size) test_batches = get_batches(test_X, test_y, batch_size) #訓練的迭代,儲存訓練損失 for batch_i in range(len(train_X) // batch_size): x, y = next(train_batches) categories = np.zeros([batch_size, 18]) for i in range(batch_size): categories[i] = x.take(3,1)[i] # categories[i] = x.take(6,1)[i] titles = np.zeros([batch_size, sentences_size]) for i in range(batch_size): titles[i] = x.take(2,1)[i] feed = { uid: np.reshape(x.take(0,1), [batch_size, 1]), movie_id: np.reshape(x.take(1,1), [batch_size, 1]), movie_categories: categories, #x.take(3,1) movie_titles: titles, #x.take(2,1) targets: np.reshape(y, [batch_size, 1]), dropout_keep_prob: dropout_keep, #dropout_keep lr: learning_rate} step, train_loss, summaries, _ = sess.run([global_step, loss, train_summary_op, train_op], feed) #cost losses['train'].append(train_loss) train_summary_writer.add_summary(summaries, step) # # Show every <show_every_n_batches> batches if (epoch_i * (len(train_X) // batch_size) + batch_i) % show_every_n_batches == 0: time_str = datetime.datetime.now().isoformat() print('{}: Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f}'.format( time_str, epoch_i, batch_i, (len(train_X) // batch_size), train_loss)) #使用測試資料的迭代 for batch_i in range(len(test_X) // batch_size): x, y = next(test_batches) categories = np.zeros([batch_size, 18]) for i in range(batch_size): categories[i] = x.take(3,1)[i] #x.take(3,1)中的3要根據自己的資料去做修改 titles = np.zeros([batch_size, sentences_size]) for i in range(batch_size): titles[i] = x.take(2,1)[i] #x.take(2,1)中的2要根據自己的資料去做修改 feed = { uid: np.reshape(x.take(0,1), [batch_size, 1]), movie_id: np.reshape(x.take(1,1), [batch_size, 1]), movie_categories: categories, #x.take(3,1) movie_titles: titles, #x.take(2,1) targets: np.reshape(y, [batch_size, 1]), dropout_keep_prob: 1, lr: learning_rate} step, test_loss, summaries = sess.run([global_step, loss, inference_summary_op], feed) #cost #儲存測試損失 losses['test'].append(test_loss) inference_summary_writer.add_summary(summaries, step) # time_str = datetime.datetime.now().isoformat() if (epoch_i * (len(test_X) // batch_size) + batch_i) % show_every_n_batches == 0: print('{}: Epoch {:>3} Batch {:>4}/{} test_loss = {:.3f}'.format( time_str, epoch_i, batch_i, (len(test_X) // batch_size), test_loss)) # Save Model saver.save(sess, save_dir) #, global_step=epoch_i print('Model Trained and Saved') # 儲存引數 save_params((save_dir)) load_dir = load_params() # 顯示訓練Loss plt.plot(losses['train'], label='Training loss') plt.legend() _ = plt.ylim() # 顯示測試Loss plt.plot(losses['test'], label='Test loss') plt.legend() _ = plt.ylim() # 獲取Tensors def get_tensors(loaded_graph): uid = loaded_graph.get_tensor_by_name("uid:0") movie_id = loaded_graph.get_tensor_by_name("movie_id:0") movie_categories = loaded_graph.get_tensor_by_name("movie_categories:0") movie_titles = loaded_graph.get_tensor_by_name("movie_titles:0") targets = loaded_graph.get_tensor_by_name("targets:0") dropout_keep_prob = loaded_graph.get_tensor_by_name("dropout_keep_prob:0") lr = loaded_graph.get_tensor_by_name("LearningRate:0") #兩種不同計算預測評分的方案使用不同的name獲取tensor inference # inference = loaded_graph.get_tensor_by_name("inference/inference/BiasAdd:0") inference = loaded_graph.get_tensor_by_name("inference/ExpandDims:0") # 之前是MatMul:0 因為inference程式碼修改了 這裡也要修改 感謝網友 @清歌 指出問題 movie_combine_layer_flat = loaded_graph.get_tensor_by_name("movie_fc/Reshape:0") user_combine_layer_flat = loaded_graph.get_tensor_by_name("user_fc/Reshape:0") return uid, movie_id, movie_categories, movie_titles, targets, lr, dropout_keep_prob, inference, movie_combine_layer_flat, user_combine_layer_flat #指定使用者和電影進行評分 def rating_movie(user_id_val, movie_id_val): loaded_graph = tf.Graph() # with tf.Session(graph=loaded_graph) as sess: # # Load saved model loader = tf.train.import_meta_graph(load_dir + '.meta') loader.restore(sess, load_dir) # Get Tensors from loaded model uid, movie_id, movie_categories, movie_titles, targets, lr, dropout_keep_prob, inference,_, __ = get_tensors(loaded_graph) #loaded_graph categories = np.zeros([1, 18]) categories[0] = movies.values[movieid2idx[movie_id_val]][2] titles = np.zeros([1, sentences_size]) titles[0] = movies.values[movieid2idx[movie_id_val]][1] feed = { uid: np.reshape(users.values[user_id_val-1][0], [1, 1]), movie_id: np.reshape(movies.values[movieid2idx[movie_id_val]][0], [1, 1]), movie_categories: categories, #x.take(6,1) movie_titles: titles, #x.take(5,1) dropout_keep_prob: 1} # Get Prediction inference_val = sess.run([inference], feed) return (inference_val) rating_movie(23, 1) #生成Movie特徵矩陣 loaded_graph = tf.Graph() # movie_matrics = [] with tf.Session(graph=loaded_graph) as sess: # # Load saved model loader = tf.train.import_meta_graph(load_dir + '.meta') loader.restore(sess, load_dir) # Get Tensors from loaded model uid, movie_id, movie_categories, movie_titles, targets, lr, dropout_keep_prob, _, movie_combine_layer_flat, __ = get_tensors(loaded_graph) #loaded_graph for item in movies.values: categories = np.zeros([1, 18]) categories[0] = item.take(2) titles = np.zeros([1, sentences_size]) titles[0] = item.take(1) feed = { movie_id: np.reshape(item.take(0), [1, 1]), movie_categories: categories, #x.take(3,1) movie_titles: titles, #x.take(2,1) dropout_keep_prob: 1} movie_combine_layer_flat_val = sess.run([movie_combine_layer_flat], feed) movie_matrics.append(movie_combine_layer_flat_val) pickle.dump((np.array(movie_matrics).reshape(-1, 200)), open('movie_matrics.p', 'wb')) movie_matrics = pickle.load(open('movie_matrics.p', mode='rb')) #生成User特徵矩陣 loaded_graph = tf.Graph() # users_matrics = [] with tf.Session(graph=loaded_graph) as sess: # # Load saved model loader = tf.train.import_meta_graph(load_dir + '.meta') loader.restore(sess, load_dir) # Get Tensors from loaded model uid, movie_id, movie_categories, movie_titles, targets, lr, dropout_keep_prob, _, __,user_combine_layer_flat = get_tensors(loaded_graph) #loaded_graph for item in users.values: feed = { uid: np.reshape(item.take(0), [1, 1]), dropout_keep_prob: 1} user_combine_layer_flat_val = sess.run([user_combine_layer_flat], feed) users_matrics.append(user_combine_layer_flat_val) pickle.dump((np.array(users_matrics).reshape(-1, 200)), open('./wordsbag/dataprocess/data/week/mydata/users_matrics.p', 'wb')) users_matrics = pickle.load(open('./wordsbag/dataprocess/data/week/mydata/users_matrics.p', mode='rb')) users_matrics = pickle.load(open('./wordsbag/dataprocess/data/week/mydata/users_matrics.p', mode='rb')) # # def recommend_same_type_movie(movie_id_val, top_k = 20): loaded_graph = tf.Graph() # with tf.Session(graph=loaded_graph) as sess: # # Load saved model loader = tf.train.import_meta_graph(load_dir + '.meta') loader.restore(sess, load_dir) norm_movie_matrics = tf.sqrt(tf.reduce_sum(tf.square(movie_matrics), 1, keep_dims=True)) normalized_movie_matrics = movie_matrics / norm_movie_matrics #推薦同類型的電影 probs_embeddings = (movie_matrics[movieid2idx[movie_id_val]]).reshape([1, 200]) probs_similarity = tf.matmul(probs_embeddings, tf.transpose(normalized_movie_matrics)) sim = (probs_similarity.eval()) # results = (-sim[0]).argsort()[0:top_k] # print(results) print("您看的電影是:{}".format(movies_orig[movieid2idx[movie_id_val]])) print("以下是給您的推薦:") p = np.squeeze(sim) p[np.argsort(p)[:-top_k]] = 0 p = p / np.sum(p) results = set() while len(results) != 10: c = np.random.choice(501, 1, p=p)[0] results.add(c) for val in (results): print(val) print(movies_orig[val]) return results recommend_same_type_movie(20, 10)
呼叫recommend_your_favorite_movie(使用者收看過的節目id,推薦個數N) ,可以輸入和使用者收看節目標籤類似的節目。
recommend_your_favorite_movie(2, 10)
您看的電影是:[20 '歡樂頌' '劇情 / 愛情'] 以下是給您的推薦: 257 [401 '生活啟示錄' '劇情 / 愛情'] 193 [275 '朋友圈' '劇情 / 愛情'] 262 [409 '平凡歲月' '劇情 / 愛情'] 426 [770 '公主夜遊記' '劇情 / 愛情'] 460 [867 '觀音山' '劇情 / 愛情'] 46 [75 '人間至味是清歡' '劇情 / 愛情'] 303 [498 '上海灘' '劇情 / 愛情'] 16 [20 '歡樂頌' '劇情 / 愛情'] 308 [505 '逆光飛翔' '劇情 / 愛情'] 86 [134 '向幸福出發' '劇情 / 愛情']