Python自然語言處理實戰（8）：情感分析技術

阿新 • • 發佈：2019-01-20

實戰電影評論情感分析

情感分析是判斷一段文字所表達的情緒狀態。其中，一段文字可以是一個句子、一個段落或者一個文件。主要涉及兩個問題：文字表示和文字分類。在深度學習出現之前，主流的表示方法有BOW(詞袋模型)和topic model(主題模型)，分類模型主要有SVM和LR。

載入資料：IMDB情感分析資料集，訓練集和測試集分別包含了25000條已標註的電影評論，滿分為10分，評分小於等於4為負面評論。

# -*- coding: utf-8 -*-

import numpy as np 
# Load the pre-trained word-vector model. Per the original author's note it
# holds 400,000 word vectors with 50 values per row.
words_list = np.load('wordsList.npy')
print('載入word列表')
# The stored entries are byte strings: convert the array to a plain list and
# decode every entry to str in a single pass.
words_list = [entry.decode('UTF-8') for entry in words_list.tolist()]
word_vectors = np.load('wordVectors.npy')
print('載入文字向量')

# Sanity check: vocabulary size and embedding-matrix shape.
print(len(words_list))
print(word_vectors.shape)

# Show the vector stored for one sample word.
Home_index = words_list.index("home")
print(word_vectors[Home_index])

# 載入電影資料
import os
from os.path import isfile, join
def _first_line_word_count(path):
    """Return the number of whitespace-separated tokens on the first line of *path*."""
    with open(path, "r", encoding='utf-8') as handle:
        return len(handle.readline().split())


# Regular files directly inside pos/ and neg/ (sub-directories are skipped).
pos_files = ['pos/' + name for name in os.listdir('pos/') if isfile(join('pos/', name))]
neg_files = ['neg/' + name for name in os.listdir('neg/') if isfile(join('neg/', name))]

# Word count of each review: positive files first, negative files after them.
# Only the first line of every file is read.
num_words = [_first_line_word_count(path) for path in pos_files]
print('正面評價完結')

num_words.extend(_first_line_word_count(path) for path in neg_files)
print('負面評價完結')

# Summary statistics over all reviews.
num_files = len(num_words)
total_words = sum(num_words)
print('檔案總數', num_files)
print('所有的詞的數量', total_words)
print('平均檔案詞的長度', total_words / num_files)

'''
# 視覺化
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('qt4agg')
# 指定預設字型
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#%matplotlib inline
plt.hist(num_words, 50, facecolor='g')
plt.xlabel('文字長度')
plt.ylabel('頻次')
plt.axis([0, 1200, 0, 8000])
plt.show()
'''

# Most reviews are shorter than 230 words.
max_seg_len = 300

# Turn the text into an index matrix; the result is a 25000x300 matrix.
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")


def cleanSentence(string):
    """Normalise one review line before it is split into words.

    The text is lower-cased, every ``<br />`` tag becomes a single space, and
    each run of characters other than ASCII letters, digits and spaces is
    removed.

    Args:
        string: raw review text.

    Returns:
        The cleaned, lower-case text.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)
# Allocate the zero-filled index matrix: one row per review file and
# max_seg_num int32 slots per row. file_count starts at the first row.
print('儲存idxMatrix...')
max_seg_num = 300
file_count = 0
ids = np.zeros([num_files, max_seg_num], dtype=np.int32)
'''
for pf in pos_files:
	with open(pf, "r", encoding="utf-8") as f:
		indexCounter = 0
		line = f.readline()
		cleanedLine = cleanSentence(line)
		split = cleanedLine.split()
		for word in split:
			try:
				ids[file_count][indexCounter] = words_list.index(word)
			except ValueError:
				ids[file_count][indexCounter] = 399999 # 未知的詞
			indexCounter = indexCounter + 1
			if indexCounter >= max_seg_num:
				break
		file_count = file_count + 1
		print(file_count)
print('儲存完成1')
for nf in neg_files:
	with open(nf, "r", encoding="utf-8") as f:
		indexCounter = 0
		line = f.readline()
		cleanedLine = cleanSentence(line)
		split = cleanedLine.split()
		for word in split:
			try:
				ids[file_count][indexCounter] = words_list.index(word)
			except ValueError:
				ids[file_count][indexCounter] = 399999 # 未知的詞
			indexCounter = indexCounter + 1
			if indexCounter >= max_seg_num:
				break
		file_count = file_count + 1

# 儲存到檔案
np.save('idxMatrix', ids)
print('儲存完成2')
'''

# Model settings
batch_size = 24  # number of reviews per batch
lstm_units = 64  # size handed to BasicLSTMCell when the graph is built
num_labels = 2  # width of the one-hot label vectors ([1, 0] / [0, 1])
iterations = 200000  # reassigned to 10 right before the evaluation loop
max_seg_num = 250  # replaces the 300 assigned earlier in the script
# NOTE(review): the disabled block above would save its matrix as 'idxMatrix'
# (300 columns), whereas this loads 'idsMatrix.npy' and the code below treats
# rows as max_seg_num (250) wide -- presumably a separately supplied,
# pre-built matrix; confirm which file is intended.
ids = np.load('idsMatrix.npy')

# 返回一個數據集的迭代器， 返回一批訓練集合
from random import randint
def get_train_batch():
    """Build one random training batch.

    Even batch slots receive a review drawn from picks 1..11499 and the label
    [1, 0]; odd slots receive a review drawn from picks 13499..24999 and the
    label [0, 1]. A pick is 1-based, so it selects row ``pick - 1`` of the
    module-level ``ids`` matrix (rows 0..11498 and 13498..24998).

    Returns:
        (arr, labels): ``arr`` is a float array of shape
        (batch_size, max_seg_num) holding the word indices of the chosen
        reviews; ``labels`` is a list of batch_size one-hot label lists.
    """
    batch = np.zeros([batch_size, max_seg_num])
    labels = []
    for slot in range(batch_size):
        if slot % 2:
            pick, one_hot = randint(13499, 24999), [0, 1]
        else:
            pick, one_hot = randint(1, 11499), [1, 0]
        labels.append(one_hot)
        batch[slot] = ids[pick - 1]
    return batch, labels

def get_test_batch():
    """Build one random batch from the held-out middle slice of ``ids``.

    Rows are drawn uniformly from the 0-based range 11499..13497 of the
    module-level ``ids`` matrix. That range shares no row with the training
    rows (0..11498 and 13498..24998); the previous 1-based sampling could
    return rows 11498 and 13498, which the training batches also use.

    Labels assume that rows 0..12499 of ``ids`` are positive reviews and the
    remaining rows are negative -- 12,500 of each, positive files first.
    TODO confirm against the matrix that is actually loaded. The previous
    ``num <= 12499`` test on a 1-based pick labelled row 12499, the last
    positive review under that layout, as negative.

    Returns:
        (arr, labels): ``arr`` is a float array of shape
        (batch_size, max_seg_num) holding the word indices of the chosen
        reviews; ``labels`` is a list of batch_size one-hot lists, [1, 0] for
        a positive review and [0, 1] for a negative one.
    """
    labels = []
    arr = np.zeros([batch_size, max_seg_num])
    for i in range(batch_size):
        row = randint(11499, 13497)  # 0-based row index, both ends inclusive
        if row < 12500:
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[row]
    return arr, labels

# Dimensions for each word vector.
# NOTE(review): the note where the vectors are loaded says each row has 50
# values, which disagrees with 300 here. The value is only used for the
# placeholder Variable below, so the mismatch does not reach the LSTM input.
num_dimensions = 300

import tensorflow as tf
tf.reset_default_graph()
# Per-batch inputs: one-hot labels and the word-index rows of the reviews.
labels = tf.placeholder(tf.float32, [batch_size, num_labels])
input_data = tf.placeholder(tf.int32, [batch_size, max_seg_num])

# NOTE(review): this Variable is rebound on the very next line, so its value
# is never read. It still exists in the graph, and because the Variables here
# get auto-generated names, removing it would presumably rename `weight` and
# `bias` and break checkpoints saved from this graph -- confirm before
# deleting.
data = tf.Variable(tf.zeros([batch_size, max_seg_num, num_dimensions]), dtype=tf.float32)
# Replace every word index by its row of word_vectors.
data = tf.nn.embedding_lookup(word_vectors, input_data)

# Configure the LSTM network.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
# Dropout on the cell output to reduce over-fitting.
# NOTE(review): the keep probability is a fixed 0.75, so dropout is also
# applied when this same graph is run for evaluation.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Take the output of the last time step, multiply it by a weight matrix and
# add a bias to obtain the final label scores.
weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
# Swap the first two axes so that the leading axis can be indexed below;
# assumes dynamic_rnn returned (batch, time, units), its documented default
# layout -- TODO confirm.
value = tf.transpose(value, [1, 0, 2])
# Select the final entry along the (now leading) axis.
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Prediction check and accuracy metric: a row is correct when the arg-max of
# the scores equals the arg-max of the one-hot label.
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Use the standard softmax cross-entropy as the loss and Adam as the optimizer.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

# Session used to evaluate the graph on held-out batches.
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))

# Every Variable in the graph needs a value before `accuracy` can be run;
# without this step the first sess.run() fails on uninitialised variables.
# Prefer the newest checkpoint under 'models'; if there is none, fall back to
# freshly initialised weights so the script still runs end to end.
saver = tf.train.Saver()
latest_checkpoint = tf.train.latest_checkpoint('models')
if latest_checkpoint is not None:
    saver.restore(sess, latest_checkpoint)
else:
    print("No checkpoint found under 'models'; evaluating untrained weights")
    sess.run(tf.global_variables_initializer())

# Report the accuracy (in percent) on a few random held-out batches.
iterations = 10
for i in range(iterations):
    next_batch, next_batch_labels = get_test_batch()
    print("正確率:", (sess.run(
        accuracy, {input_data: next_batch, labels: next_batch_labels})) * 100)

'''
# 使用tensorboard視覺化損失值和正確值
import datetime
sess = tf.InteractiveSession()
#tf.device("/cpu:0")
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

for i in range(iterations):
	# 下個批次的資料
	nextBatch, nextBatchLabels = get_train_batch();
	sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
	# 每50次寫入一次leadboard
	if (i % 50 == 0):
		summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
		writer.add_summary(summary, i)
	# 每10,000次儲存一個模型
	if (i % 10000 == 0 and i != 0):
		save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
		print("saved to %s" % save_path)

writer.close()
'''

Python自然語言處理實戰（8）：情感分析技術

實戰電影評論情感分析　情感分析是判斷一段文字所表達的情緒狀態。其中，一段文字可以是一個句子、一個段落或者一個文件。主要涉及兩個問題：文字表示和文字分類。在深度學習出現之前，主流的表示方法有BOW(詞袋模型)和topic model(主題模型)，分類模型主要有SVM

Python自然語言處理實戰（3）：中文分詞技術

3.1、中文分詞簡介在英文中，單詞本身就是“詞”的表達，一篇英文文章就是“單詞”加分隔符（空格）來表示的，而在漢語中，詞以字為基本單位的，但是一篇文章的語義表達卻仍然是以詞來劃分的。自中文自動分詞被提出以來，歷經將近30年的探索，提出了很多方法，可

Python自然語言處理實戰（1）：NLP基礎

從建模的角度看，為了方便計算機處理，自然語言可以被定義為一組規則或符號的集合，我們組合集合中的符號來傳遞各種資訊。自然語言處理研究表示語言能力、語言應用的模型，通過建立計算機框架來實現這樣的語言模型，並且不斷完善這樣的語言模型，還需要根據語言模型來設計各種實用的系

Python+NLTK自然語言處理學習（二）：常用方法（similar、common_contexts、generate）

一、similar 用來識別文章中和搜尋詞相似的詞語，可以用在搜尋引擎中的相關度識別功能中。 text1.similar("monstrous") 查詢出了text1中與monstrous相關的所有詞語：二、common_contexts 用來識別2個

自然語言處理隨筆（一）

索引中國大學 import pip for earch 清華北京安裝jieba中文分詞命令：pip install jieba 簡單的例子： import jiebaseg_list = jieba.cut("我來到北京清華大學", cut_all=True)pri

【數學之美筆記】自然語言處理部分（一）.md

strip BE 模擬 ges arr 實驗語句次數而不是文字、數字、語言、信息數字、文字和自然語言一樣，都是信息的載體，他們的產生都是為了記錄和傳播信息。但是貌似數學與語言學的關系不大，在很長一段時間內，數學主要用於天文學、力學。本章，我們將回顧一下信息時

自然語言處理NLP（一）

rac 控制臺分析 arm ont 正則表達 stop python none NLP 自然語言：指一種隨著社會發展而自然演化的語言，即人們日常交流所使用的語言；自然語言處理：通過技術手段，使用計算機對自然語言進行各種操作的一個學科； NLP研究的內容

自然語言處理NLP（二）

哪些一個圖片 ali cor res https 的區別進行詞性標註標註語料庫；各詞性標註及其含義自動標註器；默認標註器；正則表達式標註器；查詢標註器； N-gram標註器；一元標註器；分離訓練和測試數據；一般的N-gram的標註

斯坦福大學-自然語言處理入門筆記第七課情感分析（sentiment analysis）

一、情感分析簡述情感分析（sentiment analysis），又叫意見抽取（opinion extraction），意見挖掘（opinion mining）,情感挖掘（sentiment mining）以及主觀分析（subjectivity analysis）。情感分

python自然語言處理-讀書筆記8

#N-Gram Tagging N元語法標註 #一元標註（Unigram Tagging）一元標註器基於一個簡單的統計演算法：對每個識別符號分配這個獨特的識別符號最有可能的標記。例如：它將分配標記 JJ 給詞 frequent 的所有出現，因為frequent 用作一個形容詞（例如：a f

自然語言處理NLP（三）

樣本點中的關鍵度量指標：距離定義：常用距離：歐氏距離，euclidean–通常意義下的距離；馬氏距離，mahalanobis–考慮到變數間的相關性，且與變數單位無關；餘弦距離，cosi

自然語言處理NLP（四）

實體識別實體識別–分塊型別：名詞短語分塊；標記模式分塊；正則表示式分塊；分塊的表示方法：標記和樹狀圖；分塊器評估；命名實體識別；命名實體定義：指特定型別的個體，是一些確切的名詞短語

自然語言處理入門（2）——中文文字處理利器snownlp

SnowNLP是一個python寫的類庫，可以方便的處理中文文字內容。如中文分詞、詞性標註、情感分析、文字分類、提取文字關鍵詞、文字相似度計算等。 snownlp示例如下所示： # -*- coding: utf-8 -*- """ Created on

《Python自然語言處理實戰：核心技術與算法》PDF新書推介，附帶鏈接地址

核心技術正則表達 eba 詞性標註 6.2 排序系統 ext 書籍本書從各個方面著手，幫助讀者理解NLP的過程，提供了各種實戰場景，結合現實項目背景，幫助讀者理解NLP中的數據結構和算法以及目前主流的NLP技術與方法論，結合信息檢索技術與大數據應用等流行技術，終完成對

簡單應用復旦FNLP自然語言處理工具（一）

FNLP是一個基於機器學習的中文自然語言文字處理的開發工具包，FNLP主要是為中文自然語言處理而開發的工具包，也包含為實現這些任務的機器學習演算法和資料集。　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　----------百度百科資訊

自然語言處理基礎（1）--基本分詞方法

基本的分詞方法包括最大匹配法、最大概率法（最短加權路徑法）、最少分詞法、基於HMM的分詞法、基於互現資訊的分詞方法、基於字元標註的方法和基於例項的漢語分詞方法等。 1.最大匹配法最大匹配法需要一個詞表，分詞的過程中用文字的候選

自然語言處理入門（4）——中文分詞原理及分詞工具介紹

本文首先介紹下中文分詞的基本原理，然後介紹下國內比較流行的中文分詞工具，如jieba、SnowNLP、THULAC、NLPIR，上述分詞工具都已經在github上開源，後續也會附上github連結，以供參考。 1.中文分詞原理介紹 1.1 中文分詞概述中

車萬翔《基於深度學習的自然語言處理》中英文PDF+塗銘《Python自然語言處理實戰核心技術與算法》PDF及代碼

提取實用進行分析表達式詞法快速入門重點 tps 自然語言處理是人工智能領域的一個重要的研究方向，是計算機科學與語言學的交叉學科。隨著互聯網的快速發展，網絡文本尤其是用戶生成的文本呈爆炸性增長，為自然語言處理帶來了巨大的應用需求。但是由於自然語言具有歧義性、動態

python自然語言處理（二）

1詞性標註簡單的理解就是對詞性（POS）進行標註，但在不同的領域，詞性可能是不同的，Penn Treebank pos標記庫：https://blog.csdn.net/u010099495/article/details/46776617 其中程式需要安裝兩個依賴包 nlt

python自然語言處理（一）

1標識化處理何為標識化處理？實際上就是一個將原生字串分割成一系列有意義的分詞，其複雜性根據不同NLP應用而異，目標語言的複雜性也佔了很大部分，例如中文的標識化是要比英文要複雜。 word_tokenize()是一種通用的，面向所有語料庫的標識化方法，基本能應付絕大多數。 reg

Python自然語言處理實戰（8）：情感分析技術

實戰電影評論情感分析

相關推薦