1. 程式人生 > >文字情感分析+python+正面和負面新聞+新浪微博+情感字典+機器學習

文字情感分析+python+正面和負面新聞+新浪微博+情感字典+機器學習

文字情感分析

上一篇已經完成了對新浪微博的爬取,並解決了模擬登入的問題。這次小編開始研究如何
對微博文本做正面/負面的情感分析。從網上搜索了好多方法,主要有機器學習和情感字典
兩種,可是機器學習需要比較深的知識儲備,而小編還是小白,所以就選擇了情感字典方法。
好了,直接上程式碼吧,程式碼是可以直接執行的。
前提是需要先安裝相關的庫 jieba,
直接執行 pip install jieba
就能安裝。

一、資料準備

首先要準備以下四種字典:

  1. 情感字典(每行為「詞語 分數」,以空格分隔),如:
    最尼瑪 -6.70400012637
    擾民 -6.49756445867
    fuck… -6.32963390433
    RNM -6.21861284426
    wcnmlgb -5.96710044003
    2.5: -5.90459648251
  2. 停用詞字典,如:
    !,”,#,$,&
  3. 程度副詞(每行為「詞語 權重」,以空格分隔),如:
    百分之百 6
    倍加 6
    備至 6
    不得了 6
    不堪 6
    不可開交 6
    不亦樂乎 6
  4. 否定詞,如:不,沒,無,非,莫,弗,勿
    如果需要以上這些字典,可以到 CSDN 下載,我已經上傳了。

二、情感分析

#!/usr/bin/env python
#coding:utf-8

import io

import jieba

def _iterLines(path):
    """Yield the non-empty lines of a UTF-8 text file, line endings removed.

    The file is decoded as text (a leading BOM is dropped) so that the
    entries compare equal to the unicode tokens produced by the segmenter.
    """
    with io.open(path, encoding='utf-8-sig') as handle:
        for raw in handle:
            line = raw.rstrip('\r\n')
            if line:
                yield line


def _loadWordSet(path):
    """Return the lines of *path* as a set (one word per line)."""
    return set(_iterLines(path))


def _loadWeights(path, report):
    """Return ``{word: float}`` parsed from ``word value`` lines of *path*.

    Lines that lack a second field or whose value is not a number are
    skipped; when *report* is true each skipped line is printed.
    """
    weights = {}
    for line in _iterLines(path):
        fields = line.split()
        try:
            weights[fields[0]] = float(fields[1])
        except (IndexError, ValueError):
            if report:
                print(u"資料錯誤:" + line)
    return weights


class SentimentAnalysis(object):
    """Dictionary-based sentiment scorer.

    A sentence is split into words, stop words are dropped, and every
    remaining word is classified as a degree adverb, a negation word, or a
    scored word (sentiment-dictionary value, or 0 when the word is unknown).
    The score of the sentence is the sum of the scored words, each multiplied
    by the negation/degree modifiers that immediately precede it.
    """

    def __init__(self, sentiment, noword, adverb, stopword):
        """Load the four word lists.

        Args:
            sentiment: path of the sentiment dictionary (``word score`` lines).
            noword: path of the negation word list (one word per line).
            adverb: path of the degree adverb list (``word weight`` lines).
            stopword: path of the stop word list (one word per line).
        """
        # Start with an empty sentence so scoring before setSentence() is safe.
        self.__sentence = u''
        self.__readFile(sentiment, noword, adverb, stopword)

    def __readFile(self, sentiment, noword, adverb, stopword):
        """Read the word lists from disk into the instance."""
        # Malformed sentiment lines are skipped silently; malformed adverb
        # lines are reported on stdout.
        self.__sentList = _loadWeights(sentiment, report=False)
        self.__noword = _loadWordSet(noword)
        self.__adverb = _loadWeights(adverb, report=True)
        self.__stopword = _loadWordSet(stopword)

    def setSentence(self, sentence):
        """Store *sentence* (leading whitespace removed) for later scoring."""
        self.__sentence = sentence.lstrip()

    def preDetail(self, words=None):
        """Classify the words of the sentence.

        Args:
            words: optional iterable of already segmented words. When omitted
                the stored sentence is segmented with ``jieba.cut``.

        Returns:
            A tuple ``(senWord, notWord, degreeWord, newWords)`` of dicts
            keyed by ``str(position)``, where position counts the words left
            after stop word removal. ``senWord`` maps to the sentiment score
            (0 for words found in no list), ``notWord`` maps to -1,
            ``degreeWord`` maps to the adverb weight and ``newWords`` maps to
            the word itself.
        """
        if words is None:
            words = jieba.cut(self.__sentence, cut_all=False)

        newWords = {}
        senWord = {}
        notWord = {}
        degreeWord = {}
        kept = (w for w in words if w not in self.__stopword)
        for position, word in enumerate(kept):
            key = str(position)
            newWords[key] = word
            # Priority when a word appears in several lists:
            # degree adverb, then negation, then sentiment.
            if word in self.__adverb:
                degreeWord[key] = self.__adverb[word]
            elif word in self.__noword:
                notWord[key] = -1
            elif word in self.__sentList:
                senWord[key] = self.__sentList[word]
            else:
                senWord[key] = 0
        return senWord, notWord, degreeWord, newWords

    def getScore(self, words=None):
        """Return the sentiment score of the sentence.

        Args:
            words: optional iterable of already segmented words, passed on to
                ``preDetail``.

        Returns:
            The summed score; positive means positive sentiment, negative
            means negative sentiment.
        """
        senWord, notWord, degreeWord, newWords = self.preDetail(words)

        weight = 1.0
        score = 0.0
        for position in range(len(newWords)):
            key = str(position)
            if key in notWord:
                # A negation flips the sign of the next scored word.
                weight = -weight
            elif key in degreeWord:
                # A degree adverb scales the next scored word.
                weight *= float(degreeWord[key])
            else:
                score += weight * float(senWord[key])
                # Modifiers only affect the word that directly follows them.
                weight = 1.0
        return score


def getAnalysis():
    """Build a SentimentAnalysis from the word lists in the working directory."""
    return SentimentAnalysis('情感字典.txt', '否定詞.txt', '副詞.txt', '停用詞.txt')


if __name__ == '__main__':
    s = getAnalysis()
    s.setSentence('句子')
    # A positive score means positive news, a negative score negative news.
    print(s.getScore())