程式人生 > 電商產品評論的資料情感分析python程式碼實現

電商產品評論的資料情感分析python程式碼實現

步驟1:從爬取的資料中提取對應的評論資訊

#-*- coding: utf-8 -*-
# Step 1: pull the review text for the "Midea" (美的) brand out of the
# aggregated review dump and save it as plain text, one review per line.
import pandas as pd

inputfile = '.../huizong.csv' #aggregated review file
outputfile = '.../meidi_jd.txt' #path for the extracted reviews
raw = pd.read_csv(inputfile, encoding = 'utf-8')
# Keep only the review column for rows whose brand equals 美的.
midea = raw.loc[raw[u'品牌'] == u'美的', [u'評論']]
midea.to_csv(outputfile, index = False, header = False, encoding = 'utf-8')

步驟2:刪除評論資訊中重複的評論

#-*- coding: utf-8 -*-
# Step 2: drop duplicate reviews, keeping the first occurrence (order is
# preserved), and report how many were removed.
import pandas as pd

inputfile = '.../meidi_jd.txt' #review file
outputfile = '.../meidi_jd_process_1.txt' #path for the de-duplicated reviews
reviews = pd.read_csv(inputfile, encoding = 'utf-8', header = None)
count_before = len(reviews)
# drop_duplicates keeps the first occurrence, same as unique() on column 0.
reviews = pd.DataFrame(reviews[0].drop_duplicates().values)
count_after = len(reviews)
reviews.to_csv(outputfile, index = False, header = False, encoding = 'utf-8')
print(u'刪除了%s條評論。' %(count_before - count_after))

步驟3:刪除評論的字首中相同的詞

#-*- coding: utf-8 -*-
# Step 3: per the article, this removes immediately-repeated runs of
# characters from each review (e.g. a phrase typed twice in a row).
# The scan tracks a candidate pattern (list1) and a tentative repetition
# of it (list2); when the repetition matches the pattern, the indices of
# the duplicate characters are queued in del1 and deleted afterwards.
# NOTE(review): the control flow is intricate and order-dependent, so the
# code below is left byte-for-byte unchanged; only comments were added.
import codecs
inputfile = '.../meidi_jd_process_1.txt' #review file
outputfile = '.../meidi_jd_process_2.txt' #path for the processed reviews
f = codecs.open(inputfile ,'r','utf-8')
f1=codecs.open(outputfile,'w','utf-8')
fileList = f.readlines()
f.close()
for A_string in fileList:
    temp1= A_string.strip('\n')       #drop the trailing newline '\n'
    temp2 = temp1.lstrip('\ufeff')    #drop a leading UTF-8 BOM, if any
    temp3= temp2.strip('\r')          #drop carriage returns
    char_list=list( temp3)
    # list1: candidate pattern; list2: run being compared against it;
    # del1: indices scheduled for deletion; flag: start indices of list1.
    list1=['']
    list2=['']
    del1=[]
    flag=['']
    i=0
    while(i<len(char_list)):
        if (char_list[i]==list1[0]):
            # Current char re-starts the candidate pattern.
            if (list2==['']):
                list2[0]=char_list[i]
            else:
                if (list1==list2):
                    # Full repetition seen: mark its chars for removal.
                    t=len(list1)
                    m=0
                    while(m<t):
                        del1.append( i-m-1)
                        m=m+1
                    list2=['']
                    list2[0]=char_list[i]
                else:
                    # Mismatched partial repetition: restart tracking here.
                    list1=['']
                    list2=['']
                    flag=['']
                    list1[0]=char_list[i]
                    flag[0]=i
        else:
            if (list1==list2)and(list1!=[''])and(list2!=['']):
                if len(list1)>=2:
                    # Completed repetition (length >= 2): mark and restart.
                    t=len(list1)
                    m=0
                    while(m<t):
                        del1.append( i-m-1)
                        m=m+1
                    list1=['']
                    list2=['']
                    list1[0]=char_list[i]
                    flag[0]=i
            else:
                if(list2==['']):
                    if(list1==['']):
                        list1[0]=char_list[i]
                        flag[0]=i
                    else:
                       list1.append(char_list[i])
                       flag.append(i)
                else:
                    list2.append(char_list[i])
        i=i+1
        if(i==len(char_list)):
           # End of line: if the tail exactly repeats the pattern, delete
           # both the repetition and the original occurrence (via flag).
           if(list1==list2):
                    t=len(list1)
                    m=0
                    while(m<t):
                        del1.append( i-m-1)
                        m=m+1
                    m=0
                    while(m<t):
                        del1.append(flag[m])
                        m=m+1
    # Delete marked indices from highest to lowest so that earlier
    # positions remain valid while deleting.
    a=sorted(del1)
    t=len(a)-1
    while (t>=0):
        #print(char_list[a[t]])
        del char_list[a[t]]
        t=t-1
    str1 = "".join(char_list)
    str2=str1.strip() #strip surrounding whitespace
    f1.writelines(str2+'\r\n')
f1.close()

步驟4:將得到的每個句子進行反轉

#-*- coding: utf-8 -*-
# Step 4: reverse every sentence character-by-character so the shared
# *suffix* of a review can be removed by the same prefix-deduplication
# pass (step 3) and then reversed back (step 5).
#
# Fixes vs. the original:
#  * line[::-1] also reversed the trailing '\n' into position 0, which
#    corrupted the line structure (leading blank, missing final newline);
#    we now strip the terminator, reverse the text only, and re-append it.
#  * removed the dead pandas read of the same file into an unused frame.
inputfile = '.../meidi_jd_process_2.txt' #review file
outputfile = '.../meidi_jd_process_3.txt' #reversed sentences

with open(outputfile, 'w', encoding='utf-8') as f:
    with open(inputfile, encoding='utf8') as file_obj:
        for line in file_obj:
            f.write(line.rstrip('\r\n')[::-1] + '\n')

步驟5:將得到的反轉句子進行之後去除字首中相同的詞,然後反轉回去
步驟6:刪除評論中小於4個字元的評論

#-*- coding: utf-8 -*-
# Step 6: delete reviews shorter than 4 characters (keep length >= 4).
#
# Fixes vs. the original:
#  * len(line) > 4 counted the trailing '\n' as a character, so the rule
#    was applied to "text + newline" and behaved differently on an
#    unterminated last line; we now measure the stripped text only.
#  * removed the dead pandas read of the same file into an unused frame.
inputfile = '.../meidi_jd_process_5.txt' #review file
outputfile = '.../meidi_jd_process_end.txt' #filtered reviews

with open(outputfile, 'w', encoding='utf-8') as f:
    with open(inputfile, encoding='utf8') as file_obj:
        for line in file_obj:
            if len(line.rstrip('\r\n')) >= 4:
                f.write(line)

步驟7:通過ROSTCM6提取出正面與負面評價
步驟8:去除正面與負面評價檔案中的前面的評分

#-*- coding: utf-8 -*-
# Step 8: strip the leading sentiment score that ROSTCM6 prepends to each
# line, keeping only the review text.
import pandas as pd

#parameter initialisation
inputfile1 = '.../meidi_jd_process_end_1.txt'
inputfile2 = '.../meidi_jd_process_end_2.txt'
outputfile1 = '.../meidi_jd_neg.txt'
outputfile2 = '.../meidi_jd_pos.txt'

data1 = pd.read_csv(inputfile1, encoding = 'utf-8', header = None) #read the data
data2 = pd.read_csv(inputfile2, encoding = 'utf-8', header = None)

# Remove the score prefix with a regular expression.
# regex=True is required: Series.str.replace defaults to literal
# (regex=False) replacement since pandas 2.0, which would silently leave
# the score prefixes in place.
data1 = pd.DataFrame(data1[0].str.replace(r'.*?\d+?\t ', '', regex = True))
data2 = pd.DataFrame(data2[0].str.replace(r'.*?\d+?\t ', '', regex = True))

data1.to_csv(outputfile1, index = False, header = False, encoding = 'utf-8') #save the results
data2.to_csv(outputfile2, index = False, header = False, encoding = 'utf-8')

步驟9:對正面與負面評價進行分詞

#-*- coding: utf-8 -*-
# Step 9: tokenise the negative and positive review files with jieba,
# writing each review back as space-separated tokens.
import pandas as pd
import jieba #jieba word segmentation

#parameter initialisation
inputfile1 = '.../meidi_jd_neg.txt'
inputfile2 = '.../meidi_jd_pos.txt'
outputfile1 = '.../meidi_jd_neg_cut.txt'
outputfile2 = '.../meidi_jd_pos_cut.txt'

neg_raw = pd.read_csv(inputfile1, encoding = 'utf-8', header = None) #read the data
pos_raw = pd.read_csv(inputfile2, encoding = 'utf-8', header = None)

def _segment(sentence):
    # Join jieba's token stream into one space-separated string.
    return ' '.join(jieba.cut(sentence))

# apply() broadcasts the segmentation over every review for speed.
neg_cut = neg_raw[0].apply(_segment)
pos_cut = pos_raw[0].apply(_segment)

neg_cut.to_csv(outputfile1, index = False, header = False, encoding = 'utf-8') #save the results
pos_cut.to_csv(outputfile2, index = False, header = False, encoding = 'utf-8')

步驟10:對分詞之後的文件建立主題模型

#-*- coding: utf-8 -*-
# Step 10: build one LDA topic model each for the negative and positive
# segmented reviews and print the top words of every topic.
import pandas as pd
from gensim import corpora, models

#parameter initialisation
negfile = 'D:/MLCode/PyCode/chapter15/test/data/meidi_jd_neg_cut.txt'
posfile = 'D:/MLCode/PyCode/chapter15/test/data/meidi_jd_pos_cut.txt'
stoplist = '.../stoplist.txt'

neg = pd.read_csv(negfile, encoding = 'utf-8', header = None,engine='python') #read the data
pos = pd.read_csv(posfile, encoding = 'utf-8', header = None,engine='python')
# sep is set to a token that never occurs ('tipdm'): the default comma
# separator is itself in the stop-word list and would break the read.
stop = pd.read_csv(stoplist, encoding = 'utf-8', header = None, sep = 'tipdm',engine='python')
# pandas filters out whitespace-only entries, so re-add them manually.
stop = [' ', ''] + list(stop[0])

# Column 1: token list; column 2: tokens with stop words removed.
for frame in (neg, pos):
    frame[1] = frame[0].apply(lambda s: s.split(' '))
    frame[2] = frame[1].apply(lambda toks: [w for w in toks if w not in stop])

def _print_topics(docs, num_topics=3):
    # Build the dictionary and bag-of-words corpus, train LDA, print topics.
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda = models.LdaModel(corpus, num_topics = num_topics, id2word = dictionary)
    for k in range(num_topics):
        print(lda.print_topic(k))

_print_topics(neg[2]) #negative-topic analysis
_print_topics(pos[2]) #positive-topic analysis