1. 程式人生 > 機器學習演算法Python實現:基於情感詞典的文字情感分析

機器學習演算法Python實現:基於情感詞典的文字情感分析

# -*- coding:utf-8 -*
# This code was written in a Jupyter notebook. author: huzhifei, create time: 2018/8/14
# This script performs sentiment analysis on text data in Python using existing sentiment dictionaries.

# Import the required packages and load the custom jieba user dictionary.
import jieba
import numpy as np
# Runs at import time: registers the extra vocabulary file with jieba before any segmentation is done.
jieba.load_userdict("C:\\Users\\Desktop\\中文分詞詞庫整理\\中文分詞詞庫整理\\百度分詞詞庫.txt") 

# Open a dictionary file and return its lines as a list.
def open_dict(Dict='hahah', path='C:\\Users\\Desktop\\Textming\\'):
    """Load a word-list file and return its lines as a list of strings.

    Args:
        Dict: Base name of the dictionary file, without the ``.txt`` suffix.
        path: Directory prefix. It is concatenated directly with the file
            name, so it must end with a path separator.

    Returns:
        One string per line of ``<path><Dict>.txt``, with leading and
        trailing newline characters removed. Other whitespace is kept.

    Raises:
        OSError: If the file cannot be opened.
    """
    file_path = path + '%s.txt' % Dict
    # The context manager closes the handle even if reading fails.
    # Undecodable bytes are dropped (errors='ignore') instead of raising.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as dictionary:
        return [line.strip('\n') for line in dictionary]

def judgeodd(num):
    """Classify *num* by parity and return the string 'odd' or 'even'.

    Callers count the negation words found before a sentiment word and use
    this result to decide the sign: an odd count multiplies the score by -1,
    an even count leaves it unchanged.
    """
    return 'odd' if num % 2 else 'even'


deny_word = open_dict(Dict='deny')  # negation-word dictionary
posdict = open_dict(Dict='positive')  # positive sentiment dictionary
negdict = open_dict(Dict = 'negative')  # negative sentiment dictionary

# NOTE(review): this file is read from a different user directory (AAS-1413)
# than open_dict's default path used for the three dictionaries above — confirm
# that this is intended.
degree_word = open_dict(Dict = 'degree',path='C:\\Users\\AAS-1413\\Desktop\\Textming\\')  # degree-adverb dictionary

# Split the degree adverbs into weight groups.
# The slicing relies on the degree file containing the marker lines 'extreme',
# 'very', 'more', 'ish' and 'last' in that order; list.index raises ValueError
# if a marker is missing.
mostdict = degree_word[degree_word.index('extreme')+1: degree_word.index('very')]  # strongest group: sentiment_score_list multiplies the score by 4.0
verydict = degree_word[degree_word.index('very')+1: degree_word.index('more')]  # multiplier 3.0
moredict = degree_word[degree_word.index('more')+1: degree_word.index('ish')]  # multiplier 2.0
ishdict = degree_word[degree_word.index('ish')+1: degree_word.index('last')]  # multiplier 0.5
# Module-level list that sentiment_score_list appends its input sentences to.
# Nothing in this script clears it, so its contents persist across calls.
seg_sentence=[]


def _sentiment_word_score(preceding_words):
    """Return the signed weight of one sentiment word.

    *preceding_words* are the tokens between the previous sentiment word and
    the current one. Degree adverbs among them scale the base weight of 1;
    an odd number of negation words flips its sign.
    """
    score = 1
    negations = 0
    for w in preceding_words:
        if w in mostdict:
            score *= 4.0
        elif w in verydict:
            score *= 3.0
        elif w in moredict:
            score *= 2.0
        elif w in ishdict:
            score *= 0.5
        elif w in deny_word:
            negations += 1
    if judgeodd(negations) == 'odd':
        score *= -1.0
    return score


def _non_negative_pair(pos_total, neg_total):
    """Fold negative totals into the opposite polarity; return [pos, neg]."""
    if pos_total < 0 and neg_total > 0:
        # A negated positive counts as additional negative sentiment.
        return [0, neg_total - pos_total]
    if neg_total < 0 and pos_total > 0:
        # A negated negative counts as additional positive sentiment.
        return [pos_total - neg_total, 0]
    if pos_total < 0 and neg_total < 0:
        # Both negated: swap polarities, using the raw totals.
        return [-neg_total, -pos_total]
    return [pos_total, neg_total]


def sentiment_score_list(data):
    """Score every token of every sentence against the sentiment dictionaries.

    Args:
        data: Iterable of strings, one review per item. Spaces in each item
            are replaced by commas and the result is appended to the
            module-level ``seg_sentence`` list.

    Returns:
        A list with one entry per sentence currently held in ``seg_sentence``
        (including sentences added by earlier calls, since that list is never
        cleared here). Each entry is a list with one ``[pos, neg]`` pair per
        token produced by ``jieba.lcut``, holding the running positive and
        negative totals after that token.
    """
    for raw in data:
        seg_sentence.append(raw.replace(' ', ','))

    count2 = []
    for sen in seg_sentence:
        segtmp = jieba.lcut(sen, cut_all=False)  # tokenise into a list of words
        count1 = []
        a = 0  # index just past the most recent sentiment word
        poscount3 = 0  # running positive total
        negcount3 = 0  # running negative total
        for i, word in enumerate(segtmp):
            if word in posdict:
                poscount3 = _sentiment_word_score(segtmp[a:i]) + poscount3
                a = i + 1
            elif word in negdict:
                # Negations are counted from deny_word, the same as for
                # positive words.
                negcount3 = _sentiment_word_score(segtmp[a:i]) + negcount3
                a = i + 1
            elif word in ('!', '\uff01'):
                # Both the ASCII and the full-width (U+FF01) exclamation mark
                # are recognised. Walk the tokens from the end of the
                # sentence, adding 2 per sentiment word, and stop at the
                # first token that is not a sentiment word.
                for w2 in segtmp[::-1]:
                    if w2 in posdict:
                        poscount3 += 2
                    elif w2 in negdict:
                        negcount3 += 2
                    else:
                        break
            else:
                # NOTE(review): any other token resets both running totals,
                # so totals only accumulate over consecutive sentiment words
                # — confirm this is intended.
                poscount3 = 0
                negcount3 = 0

            # Record this token's totals with negative values folded away.
            count1.append(_non_negative_pair(poscount3, negcount3))
        count2.append(count1)
    return count2  # scores for all reviews

def sentiment_score(senti_score_list):
    """Label each review as positive, negative or neutral.

    Args:
        senti_score_list: One entry per review; each entry is a sequence of
            ``[pos, neg]`` score pairs, as returned by sentiment_score_list.

    Returns:
        A string with one label per review, each preceded by a newline:
        '好評' when the positive sum exceeds the negative sum, '差評' when it
        is lower, '中評' when they are equal. A review with no score pairs is
        labelled '中評'. The same verdict is also printed for each review.
    """
    labels = []
    for review in senti_score_list:
        # reshape(-1, 2) keeps an empty review two-dimensional, so the column
        # slices below give empty columns whose sum is 0.
        score_array = np.array(review).reshape(-1, 2)
        Pos = np.sum(score_array[:, 0])  # total positive score
        Neg = np.sum(score_array[:, 1])  # total negative score
        res = Pos - Neg
        if res > 0:
            label = '好評'
        elif res < 0:
            label = '差評'
        else:
            label = '中評'
        print ('該條評論是:' + label)
        labels.append(label)
    # Returned only after every review has been labelled.
    return ''.join('\n' + label for label in labels)



# Read the text to analyse; sentiment_score_list treats each line as one review.
with open("content.txt","r",errors='ignore') as data:
    # Run the analysis once and keep the result: the file iterator is
    # exhausted after a single pass, and a second run would print every
    # verdict again.
    result = sentiment_score(sentiment_score_list(data))

# Store the labels returned by sentiment_score in a text file.
with open('s.txt','w',errors='ignore') as f:
    f.write(result)