使用自然語言處理構建基於內容的推薦系統
阿新 • • 發佈:2018-11-27
資料下載地址:https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7 1.提取資料---電影標題,電影型別,電影導演,電影演員,電影劇情 2.清洗資料--- 電影劇情使用rake_nltk去除停定詞,對關鍵詞排序。 電影導演,電影演員去除空格,把姓和名作為一個單詞 3.把所有的關鍵詞拼接成bag_of_words,計算相似度。 4.對指定電影進行top10推薦。 主要的技術點:rate_nltk,sklean中的cosine_similarity,skean中的CountVectorizer
#!/usr/bin/Python # -*- coding: utf-8 -* import pandas as pd from rake_nltk import Rake import numpy as np from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import CountVectorizer pd.set_option('display.max_columns', 100) df = pd.read_csv('IMDB_Top250Engmovies2_OMDB_Detailed.csv') print(df.head()) print(df.shape) df = df[['Title','Genre','Director','Actors','Plot']] df.head() # discarding the commas between the actors' full names and getting only the first three names df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3]) print("====演員列表====") print(df['Actors'][:3]) # putting the genres in a list of words df['Genre'] = df['Genre'].map(lambda x: x.lower().split(',')) print("===型別列表===") print(df['Genre'][:3]) df['Director'] = df['Director'].map(lambda x: x.split(' ')) print("===導演列表====") print(df['Director'][:3]) # merging together first and last name for each actor and director, so it's considered as one word # and there is no mix up between people sharing a first name for index, row in df.iterrows(): # 把姓和名變成一個單詞 row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']] # print("變化前") # print(row['Director']) row['Director'] = ''.join(row['Director']).lower() # print("變化後") # print(row['Director']) for index, row in df.iterrows(): if(index<3): print("===演員列表===") print(row['Actors']) print("===導演列表===") print(row['Director']) # initializing the new column df['Key_words'] = "" for index, row in df.iterrows(): plot = row['Plot'] # instantiating Rake, by default is uses english stopwords from NLTK # and discard all puntuation characters r = Rake() # extracting the words by passing the text r.extract_keywords_from_text(plot) # getting the dictionary whith key words and their scores key_words_dict_scores = r.get_word_degrees() print("===key_words_dict_scores===") print(key_words_dict_scores) # assigning the key words to the new column row['Key_words'] = list(key_words_dict_scores.keys()) print("===key_words===") print(row['Key_words']) # dropping the Plot column df.drop(columns=['Plot'], inplace=True) df.set_index('Title', inplace = True) print(df.head()) df['bag_of_words'] = '' print("===df.columns====") print(df.columns) columns = df.columns for index, row in df.iterrows(): words = '' for col in columns: if col != 'Director': words = words + ' '.join(row[col]) + ' ' else: words = words + row[col] + ' ' print("====words:====") print(words) row['bag_of_words'] = words for index, row in df.iterrows(): print("===bag_of_words===") print(row['bag_of_words']) df.drop(columns=[col for col in df.columns if col != 'bag_of_words'], inplace=True) print("===head:===") print(df.head()) # instantiating and generating the count matrix count = CountVectorizer() count_matrix = count.fit_transform(df['bag_of_words']) # creating a Series for the movie titles so they are associated to an ordered numerical # list I will use later to match the indexes indices = pd.Series(df.index) indices[:5] print("===indices:===") print(indices) # generating the cosine similarity matrix cosine_sim = cosine_similarity(count_matrix, count_matrix) # cosine_sim print("===cosine_sim:===") print(cosine_sim) # function that takes in movie title as input and returns the top 10 recommended movies def recommendations(title, cosine_sim=cosine_sim): recommended_movies = [] # gettin the index of the movie that matches the title idx = indices[indices == title].index[0] print("===idx:====") print(idx) # creating a Series with the similarity scores in descending order score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False) print("===score_series===") print(score_series) # getting the indexes of the 10 most similar movies top_10_indexes = list(score_series.iloc[1:11].index) # populating the list with the titles of the best 10 matching movies for i in top_10_indexes: recommended_movies.append(list(df.index)[i]) return recommended_movies print(recommendations('Fargo'))