程式人生 >> 中文情感分析 glove+LSTM

中文情感分析 glove+LSTM

關鍵詞:load、.get、read、return、左右、strip()、加載、models、pyplot

最近嘗試了一下中文的情感分析。

主要使用了Glove和LSTM。語料數據集采用的是中文酒店評價語料

1、首先是訓練Glove,獲得詞向量(這裏是用的300d)。這一步使用的是jieba分詞和中文維基。

2、將中文酒店評價語料進行清洗,並分詞。分詞後轉化為詞向量的表示形式。

3、使用LSTM網絡進行訓練。

最終的正確率在91%左右

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 13:52:23 2018

@author: xyli
處理酒店評價語料數據,
分詞,並轉化為Glove向量
"""
import sys
import os import chardet import jieba import re import gensim import numpy as np import pandas as pd import matplotlib.pyplot as plt from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from keras.utils.np_utils import to_categorical from keras.layers import
Masking from keras.layers import Dense, Input, Flatten, Activation from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional,Reshape from keras.models import Sequential, Model from Attention_layer import Attention_layer from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils


def loadGLoveModel(filename):
    """Load GloVe text vectors into a dict {word: np.ndarray(float32)}.

    Each line of the file is "<word> <v1> ... <v300>".
    """
    embeddings_index = {}
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            # fix: the original used bare `float32`, an undefined name
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index


def word2Glovec(List, model):
    """Convert a token list to a list of GloVe vectors.

    Tokens missing from `model` are mapped to a 300-d zero vector
    (300 is the dimension of the trained GloVe embedding).
    """
    insert = np.zeros(300, dtype='float32')  # vector used for unknown words
    vec = []
    for w in List:
        v = model.get(w)
        vec.append(insert if v is None else v)
    return vec


def clean_str(string):
    """Tokenization/string cleaning for the dataset.

    Removes backslashes, quote characters, CR/CRLF line breaks and both
    ASCII and full-width commas, periods, parentheses and curly quotes.
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"['\"]", "", string)
    string = re.sub(r"\r\n?", "", string)
    # ASCII and full-width (Chinese) punctuation in one pass
    string = re.sub(r"[,.,。()()“”]", "", string)
    return string.strip()


def fitList(List, n):
    """Pad or truncate the token list `List` to exactly `n` entries.

    NOTE(review): the scraped original read "insert = !"; reconstructed
    here as the string token '!', which word2Glovec maps to the zero
    vector when it is absent from the GloVe vocabulary — confirm against
    the author's original source.
    """
    insert = '!'  # padding token
    L = len(List)
    if L < n:
        List += [insert] * (n - L)
    elif L > n:
        List = List[:n]
    return List
def readData(filename):
    """Read one review file, decode it, clean it and segment it.

    Returns a list of cleaned tokens produced by jieba.
    """
    with open(filename, 'rb') as f:
        data = f.read()
    # corpus files are GB-encoded; silently drop undecodable bytes
    data = data.decode('gb18030', 'ignore')
    data = clean_str(data)
    seg_list = jieba.cut(data)  # default precise mode
    segList = []
    for s in seg_list:
        segList.append(clean_str(s))
    return segList


def loadData():
    """Load the ChnSentiCorp hotel-review corpus.

    Returns (commentList, labelList): each comment is a token list, each
    label is one-hot — [0, 1] for negative, [1, 0] for positive.
    """
    Corpus_DIR = "data/ChnSentiCorp_htl_unba_10000"
    DIR = ['/neg', '/pos']
    classLabels = [[0.0, 1.0], [1.0, 0.0]]  # aligned with DIR above
    commentList = []
    labelList = []
    for subdir, label in zip(DIR, classLabels):
        rootdir = Corpus_DIR + subdir
        for name in os.listdir(rootdir):  # every entry of the class dir
            path = os.path.join(rootdir, name)
            if os.path.isfile(path):
                commentList.append(readData(path))
                # fix: append the label together with its comment so the
                # two lists stay aligned even if the directory contains
                # non-file entries (the original pre-built one label per
                # directory entry, files or not)
                labelList.append(label)
    return commentList, labelList


if __name__ == '__main__':
    List, labelList = loadData()  # load the corpus
    # load the pre-trained 300-d GloVe model
    gloveModel = loadGLoveModel('model/zhs_wiki_glove.vectors.300d.txt')
    countList = []
    commentVecList = []
    n = 100  # fixed sequence length (tokens per review)
    for c in List:
        countList.append(len(c))
        glovec = word2Glovec(fitList(c, n), gloveModel)
        commentVecList.append(glovec)

    VALIDATION_SPLIT = 0.2
    commentVecList = np.array(commentVecList)
    labelList = np.array(labelList)

    # shuffle, then hold out the last 20% as the validation split
    indices = np.arange(commentVecList.shape[0])
    np.random.shuffle(indices)
    data = commentVecList[indices]
    labels = labelList[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    model = Sequential()
    model.add(LSTM(120, input_shape=(x_train.shape[1], x_train.shape[2]),
                   return_sequences=True))
    # model.add(Activation('relu'))
    # model.add(Attention_layer())
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
    # model.add(Attention_layer())
    # model.add(Activation('relu'))
    model.add(Dropout(0.3))  # random neuron dropout
    model.add(Bidirectional(LSTM(30, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=25, batch_size=200)

本文還在完善中。。。

中文情感分析 glove+LSTM