程式人生 >> 中文情感分析 glove+LSTM

中文情感分析 glove+LSTM

關鍵詞:load、.get、read、return、左右、strip()、加載、models、pyplot

最近嘗試了一下中文的情感分析。

主要使用了Glove和LSTM。語料數據集采用的是中文酒店評價語料

1、首先是訓練Glove,獲得詞向量(這裏是用的300d)。這一步使用的是jieba分詞和中文維基。

2、將中文酒店評價語料進行清洗,並分詞。分詞後轉化為詞向量的表示形式。

3、使用LSTM網絡進行訓練。

最終的正確率在91%左右

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 13:52:23 2018

@author: xyli
處理酒店評價語料數據,
分詞,並轉化為Glove向量
"""
import sys
import os import chardet import jieba import re import gensim import numpy as np import pandas as pd import matplotlib.pyplot as plt from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from keras.utils.np_utils import to_categorical from keras.layers import
Masking from keras.layers import Dense, Input, Flatten, Activation from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional,Reshape from keras.models import Sequential, Model from Attention_layer import Attention_layer from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils


def loadGLoveModel(filename):
    """Load GloVe text vectors into a dict {word: np.ndarray(float32)}.

    Each line of the file is "<word> <v1> ... <v300>".
    """
    embeddings_index = {}
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            # fix: the original used bare `float32`, an undefined name
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index


def word2Glovec(List, model):
    """Convert a token list to a list of GloVe vectors.

    Tokens missing from `model` are mapped to a 300-d zero vector
    (300 is the dimension of the trained GloVe embedding).
    """
    insert = np.zeros(300, dtype='float32')  # vector used for unknown words
    vec = []
    for w in List:
        v = model.get(w)
        vec.append(insert if v is None else v)
    return vec


def clean_str(string):
    """Tokenization/string cleaning for the dataset.

    Removes backslashes, quote characters, CR/CRLF line breaks and both
    ASCII and full-width commas, periods, parentheses and curly quotes.
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"['\"]", "", string)
    string = re.sub(r"\r\n?", "", string)
    # ASCII and full-width (Chinese) punctuation in one pass
    string = re.sub(r"[,.,。()()“”]", "", string)
    return string.strip()


def fitList(List, n):
    """Pad or truncate the token list `List` to exactly `n` entries.

    NOTE(review): the scraped original read "insert = !"; reconstructed
    here as the string token '!', which word2Glovec maps to the zero
    vector when it is absent from the GloVe vocabulary — confirm against
    the author's original source.
    """
    insert = '!'  # padding token
    L = len(List)
    if L < n:
        List += [insert] * (n - L)
    elif L > n:
        List = List[:n]
    return List
def readData(filename):
    """Read one review file, decode it, clean it and segment it.

    Returns a list of cleaned tokens produced by jieba.
    """
    with open(filename, 'rb') as f:
        data = f.read()
    # corpus files are GB-encoded; silently drop undecodable bytes
    data = data.decode('gb18030', 'ignore')
    data = clean_str(data)
    seg_list = jieba.cut(data)  # default precise mode
    segList = []
    for s in seg_list:
        segList.append(clean_str(s))
    return segList


def loadData():
    """Load the ChnSentiCorp hotel-review corpus.

    Returns (commentList, labelList): each comment is a token list, each
    label is one-hot — [0, 1] for negative, [1, 0] for positive.
    """
    Corpus_DIR = "data/ChnSentiCorp_htl_unba_10000"
    DIR = ['/neg', '/pos']
    classLabels = [[0.0, 1.0], [1.0, 0.0]]  # aligned with DIR above
    commentList = []
    labelList = []
    for subdir, label in zip(DIR, classLabels):
        rootdir = Corpus_DIR + subdir
        for name in os.listdir(rootdir):  # every entry of the class dir
            path = os.path.join(rootdir, name)
            if os.path.isfile(path):
                commentList.append(readData(path))
                # fix: append the label together with its comment so the
                # two lists stay aligned even if the directory contains
                # non-file entries (the original pre-built one label per
                # directory entry, files or not)
                labelList.append(label)
    return commentList, labelList


if __name__ == '__main__':
    List, labelList = loadData()  # load the corpus
    # load the pre-trained 300-d GloVe model
    gloveModel = loadGLoveModel('model/zhs_wiki_glove.vectors.300d.txt')
    countList = []
    commentVecList = []
    n = 100  # fixed sequence length (tokens per review)
    for c in List:
        countList.append(len(c))
        glovec = word2Glovec(fitList(c, n), gloveModel)
        commentVecList.append(glovec)

    VALIDATION_SPLIT = 0.2
    commentVecList = np.array(commentVecList)
    labelList = np.array(labelList)

    # shuffle, then hold out the last 20% as the validation split
    indices = np.arange(commentVecList.shape[0])
    np.random.shuffle(indices)
    data = commentVecList[indices]
    labels = labelList[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    model = Sequential()
    model.add(LSTM(120, input_shape=(x_train.shape[1], x_train.shape[2]),
                   return_sequences=True))
    # model.add(Activation('relu'))
    # model.add(Attention_layer())
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
    # model.add(Attention_layer())
    # model.add(Activation('relu'))
    model.add(Dropout(0.3))  # random neuron dropout
    model.add(Bidirectional(LSTM(30, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=25, batch_size=200)

本文還在完善中。。。

中文情感分析 glove+LSTM