電商產品評論的資料情感分析python程式碼實現
阿新 • • 發佈:2018-12-13
步驟1:從爬取的資料中提取對應的評論資訊
#-*- coding: utf-8 -*- import pandas as pd inputfile = '.../huizong.csv' #評論彙總檔案 outputfile = '.../meidi_jd.txt' #評論提取後儲存路徑 data = pd.read_csv(inputfile, encoding = 'utf-8') data = data[[u'評論']][data[u'品牌'] == u'美的'] data.to_csv(outputfile, index = False, header = False, encoding = 'utf-8')
步驟2:刪除評論資訊中重複的評論
#-*- coding: utf-8 -*- import pandas as pd inputfile = '.../meidi_jd.txt' #評論檔案 outputfile = '.../meidi_jd_process_1.txt' #評論處理後儲存路徑 data = pd.read_csv(inputfile, encoding = 'utf-8', header = None) l1 = len(data) data = pd.DataFrame(data[0].unique()) l2 = len(data) data.to_csv(outputfile, index = False, header = False, encoding = 'utf-8') print(u'刪除了%s條評論。' %(l1 - l2))
步驟3:刪除評論的字首中相同的詞
#-*- coding: utf-8 -*- import codecs inputfile = '.../meidi_jd_process_1.txt' #評論檔案 outputfile = '.../meidi_jd_process_2.txt' #評論處理後儲存路徑 f = codecs.open(inputfile ,'r','utf-8') f1=codecs.open(outputfile,'w','utf-8') fileList = f.readlines() f.close() for A_string in fileList: temp1= A_string.strip('\n') #去掉每行最後的換行符'\n' temp2 = temp1.lstrip('\ufeff') temp3= temp2.strip('\r') char_list=list( temp3) list1=[''] list2=[''] del1=[] flag=[''] i=0 while(i<len(char_list)): if (char_list[i]==list1[0]): if (list2==['']): list2[0]=char_list[i] else: if (list1==list2): t=len(list1) m=0 while(m<t): del1.append( i-m-1) m=m+1 list2=[''] list2[0]=char_list[i] else: list1=[''] list2=[''] flag=[''] list1[0]=char_list[i] flag[0]=i else: if (list1==list2)and(list1!=[''])and(list2!=['']): if len(list1)>=2: t=len(list1) m=0 while(m<t): del1.append( i-m-1) m=m+1 list1=[''] list2=[''] list1[0]=char_list[i] flag[0]=i else: if(list2==['']): if(list1==['']): list1[0]=char_list[i] flag[0]=i else: list1.append(char_list[i]) flag.append(i) else: list2.append(char_list[i]) i=i+1 if(i==len(char_list)): if(list1==list2): t=len(list1) m=0 while(m<t): del1.append( i-m-1) m=m+1 m=0 while(m<t): del1.append(flag[m]) m=m+1 a=sorted(del1) t=len(a)-1 while (t>=0): #print(char_list[a[t]]) del char_list[a[t]] t=t-1 str1 = "".join(char_list) str2=str1.strip() #刪除兩邊空格 f1.writelines(str2+'\r\n') f1.close()
步驟4:將得到的每個句子進行反轉
#-*- coding: utf-8 -*-
import pandas as pd
inputfile = '.../meidi_jd_process_2.txt' #評論檔案
outputfile = '.../meidi_jd_process_3.txt' #句子倒序
data = pd.read_csv(inputfile, encoding = 'utf-8', header = None)
data = pd.DataFrame(data[0])
with open(outputfile, 'w' ,encoding='utf-8') as f:
file_obj = open(inputfile,encoding='utf8')
all_lines = file_obj.readlines()
for line in all_lines:
f.write(line[::-1])
file_obj.close()
步驟5:將得到的反轉句子進行之後去除字首中相同的詞,然後反轉回去
步驟6:刪除評論中小於4個字元的評論
#-*- coding: utf-8 -*-
import pandas as pd
inputfile = '.../meidi_jd_process_5.txt' #評論檔案
outputfile = '.../meidi_jd_process_end.txt' #句子倒序
data = pd.read_csv(inputfile, encoding = 'utf-8', header = None)
data = pd.DataFrame(data[0])
with open(outputfile, 'w' ,encoding='utf-8') as f:
file_obj = open(inputfile,encoding='utf8')
all_lines = file_obj.readlines()
for line in all_lines:
if(len(line)>4):
f.write(line)
file_obj.close()
步驟7:通過ROSTCM6提取出正面與負面評價
步驟8:去除正面與負面評價檔案中的前面的評分
#-*- coding: utf-8 -*-
import pandas as pd
#引數初始化
inputfile1 = '.../meidi_jd_process_end_1.txt'
inputfile2 = '.../meidi_jd_process_end_2.txt'
outputfile1 = '.../meidi_jd_neg.txt'
outputfile2 = '.../meidi_jd_pos.txt'
data1 = pd.read_csv(inputfile1, encoding = 'utf-8', header = None) #讀入資料
data2 = pd.read_csv(inputfile2, encoding = 'utf-8', header = None)
data1 = pd.DataFrame(data1[0].str.replace('.*?\d+?\\t ', '')) #用正則表示式修改資料
data2 = pd.DataFrame(data2[0].str.replace('.*?\d+?\\t ', ''))
data1.to_csv(outputfile1, index = False, header = False, encoding = 'utf-8') #儲存結果
data2.to_csv(outputfile2, index = False, header = False, encoding = 'utf-8')
步驟9:對正面與負面評價進行分詞
# -*- coding: utf-8 -*-
"""Step 9: segment the negative and positive comment files into
space-separated words with jieba."""
import pandas as pd
import jieba  # Chinese word segmentation

# parameter initialisation
inputfile1 = '.../meidi_jd_neg.txt'
inputfile2 = '.../meidi_jd_pos.txt'
outputfile1 = '.../meidi_jd_neg_cut.txt'
outputfile2 = '.../meidi_jd_pos_cut.txt'

# load the comments
data1 = pd.read_csv(inputfile1, encoding='utf-8', header=None)
data2 = pd.read_csv(inputfile2, encoding='utf-8', header=None)


def mycut(sentence):
    """Segment one comment into a space-joined string of words."""
    return ' '.join(jieba.cut(sentence))


# apply() broadcasts the segmentation over every row, which is faster
# than an explicit Python loop.
data1 = data1[0].apply(mycut)
data2 = data2[0].apply(mycut)

# save the segmented comments
data1.to_csv(outputfile1, index=False, header=False, encoding='utf-8')
data2.to_csv(outputfile2, index=False, header=False, encoding='utf-8')
步驟10:對分詞之後的文件建立主題模型
# -*- coding: utf-8 -*-
"""Step 10: train a 3-topic LDA model on the segmented negative and
positive comments and print each topic."""
import pandas as pd

# parameter initialisation
negfile = 'D:/MLCode/PyCode/chapter15/test/data/meidi_jd_neg_cut.txt'
posfile = 'D:/MLCode/PyCode/chapter15/test/data/meidi_jd_pos_cut.txt'
stoplist = '.../stoplist.txt'

neg = pd.read_csv(negfile, encoding='utf-8', header=None, engine='python')  # load data
pos = pd.read_csv(posfile, encoding='utf-8', header=None, engine='python')
# sep is a deliberately impossible token ('tipdm'): the default comma
# separator is itself in the stop-word list and would corrupt parsing.
stop = pd.read_csv(stoplist, encoding='utf-8', header=None, sep='tipdm',
                   engine='python')
# pandas filters out whitespace entries automatically, so add the space
# and empty string back by hand.  A set makes each per-token membership
# test below O(1) instead of scanning a list.
stop = set([' ', ''] + list(stop[0]))

neg[1] = neg[0].apply(lambda s: s.split(' '))                     # tokenize
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop])  # drop stop words
pos[1] = pos[0].apply(lambda s: s.split(' '))
pos[2] = pos[1].apply(lambda x: [i for i in x if i not in stop])

from gensim import corpora, models

# negative-topic analysis
neg_dict = corpora.Dictionary(neg[2])               # build the vocabulary
neg_corpus = [neg_dict.doc2bow(i) for i in neg[2]]  # bag-of-words corpus
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)  # train LDA
for i in range(3):
    print(neg_lda.print_topic(i))  # print each topic

# positive-topic analysis
pos_dict = corpora.Dictionary(pos[2])
pos_corpus = [pos_dict.doc2bow(i) for i in pos[2]]
pos_lda = models.LdaModel(pos_corpus, num_topics=3, id2word=pos_dict)
for i in range(3):
    print(pos_lda.print_topic(i))  # print each topic