Python NLP in Practice: Sentiment Analysis
The basic approaches to sentiment analysis are lexicon-based analysis, machine-learning-based analysis, and hybrid analysis.

Lexicon-based analysis relies on a dictionary of pre-labeled words: a lexical analyzer turns the input text into a sequence of tokens, and each token is matched against the entries in the dictionary.

The key to the machine-learning approach is choosing good features; unigrams, bigrams, and trigrams are the usual choices for the feature vector.
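To make the n-gram feature idea concrete, here is a minimal sketch using scikit-learn's CountVectorizer (scikit-learn is not used elsewhere in this post; the two example sentences are my own):

from sklearn.feature_extraction.text import CountVectorizer

# Count unigram, bigram and trigram occurrences as the feature vector
vectorizer = CountVectorizer(ngram_range=(1, 3))
docs = ["this movie was great", "this movie was terrible"]
features = vectorizer.fit_transform(docs)
print(sorted(vectorizer.vocabulary_))  # every unigram/bigram/trigram seen
print(features.toarray())              # one count vector per document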
Hands-on: sentiment analysis of movie reviews

The task breaks into five parts:
1. Train, or load, a word-vector model
2. Build an ID matrix for the training set
3. Build the LSTM computation unit
4. Train
5. Test
Step 1: Load and inspect the data
# encoding:utf-8
import numpy as np

wordsList = np.load('wordsList.npy')
print('Loaded the word list')
wordsList = wordsList.tolist()
wordsList = [word.decode('UTF-8') for word in wordsList]  # entries are stored as bytes
wordVectors = np.load('wordVectors.npy')
print('Loaded the word vectors')
print(len(wordsList))
print(wordVectors.shape)

import os
from os.path import isfile, join

pos_files = ['pos/' + f for f in os.listdir('pos/') if isfile(join('pos/', f))]
neg_files = ['neg/' + f for f in os.listdir('neg/') if isfile(join('neg/', f))]
num_words = []
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()  # each review file is a single line of text
        counter = len(line.split())
        num_words.append(counter)
print('Finished the positive reviews')

for nf in neg_files:
    with open(nf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished the negative reviews')

num_files = len(num_words)
print('Total number of files', num_files)
print('Total number of words', sum(num_words))
print('Average words per file', sum(num_words) / len(num_words))
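The two arrays are parallel: row i of wordVectors is the embedding of wordsList[i]. A quick lookup sketch ('baseball' is an arbitrary example word, not from the post):

# Find a word's position in the list, then read off its vector
baseball_index = wordsList.index('baseball')
print(wordVectors[baseball_index])  # the embedding vector for 'baseball'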
Step 2: Turn the text into an index matrix
import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
num_dimensions = 300  # dimensions of each word vector

def cleanSentences(string):
    # Lower-case, drop HTML line breaks, strip non-alphanumeric characters
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string)

max_seq_num = 250
ids = np.zeros((num_files, max_seq_num), dtype='int32')
file_count = 0
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentences(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seq_num:
                break
        file_count = file_count + 1

for nf in neg_files:
    with open(nf, "r", encoding='utf-8') as f:
        indexCounter = 0
        line = f.readline()
        cleanedLine = cleanSentences(line)
        split = cleanedLine.split()
        for word in split:
            try:
                ids[file_count][indexCounter] = wordsList.index(word)
            except ValueError:
                ids[file_count][indexCounter] = 399999  # unknown word
            indexCounter = indexCounter + 1
            if indexCounter >= max_seq_num:
                break
        file_count = file_count + 1

np.save('idsMatrix', ids)
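One practical note about the loop above: wordsList.index(word) is a linear scan over the entire vocabulary for every single token, which makes this step very slow across 25,000 reviews. A dictionary gives constant-time lookup; a small sketch (word_to_index and word_id are my own names, not from the original):

# Build the lookup table once instead of scanning the list per token
word_to_index = {word: i for i, word in enumerate(wordsList)}

def word_id(word):
    # Drop-in replacement for the try/except around wordsList.index(word)
    return word_to_index.get(word, 399999)  # 399999 = the unknown-word index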
Step 3: Helper functions for generating batches of training and test data
from random import randint

batch_size = 24  # not defined anywhere in the original post; 24 is an assumed value

def get_train_batch():
    # Even rows draw a random positive review, odd rows a random negative one;
    # the 11499-13499 range is skipped here and sampled by get_test_batch instead.
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        if i % 2 == 0:
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num - 1:num]
    return arr, labels

def get_test_batch():
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        num = randint(11499, 13499)
        if num <= 12499:
            labels.append([1, 0])  # the first half of the held-out range is positive
        else:
            labels.append([0, 1])
        arr[i] = ids[num - 1:num]
    return arr, labels
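A quick sanity check of what the helpers return (assuming the batch_size of 24 above):

arr, labels = get_train_batch()
print(arr.shape)               # (24, 250): one row of word ids per review
print(np.array(labels).shape)  # (24, 2): one-hot labels, [1, 0] = positive, [0, 1] = negative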
Step 4: Model setup
import tensorflow as tf

# Hyperparameters. lstm_units, num_labels and lr are used below but never
# defined in the original post; the values here are assumptions.
lstm_units = 64
num_labels = 2
lr = 0.001

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batch_size, num_labels])
input_data = tf.placeholder(tf.int32, [batch_size, max_seq_num])

# Get the word vectors for the batch: [batch_size, max_seq_num, num_dimensions]
data = tf.nn.embedding_lookup(wordVectors, input_data)

# Configure the LSTM cell
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
# Dropout on the outputs to reduce overfitting
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.5)
# Feed the data in and unroll the network over the whole sequence
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
# Take the LSTM output at the last time step as the review representation
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = tf.matmul(last, weight) + bias

# Accuracy metric, cross-entropy loss and optimizer
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

saver = tf.train.Saver()
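A note on the transpose/gather pair above: value comes out of dynamic_rnn with shape [batch_size, max_seq_num, lstm_units], so the code reorders it to time-major and gathers the entry at index max_seq_num - 1, i.e. the output at the final time step. The same selection can be written as a slice on the original batch-major tensor (my variant, not from the post):

# Before the transpose, this picks the output at the final time step directly
last = value[:, -1, :]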
with tf.Session() as sess:
    # Resume from a saved checkpoint if one exists, otherwise initialize
    if os.path.exists("models") and os.path.exists("models/checkpoint"):
        saver.restore(sess, tf.train.latest_checkpoint('models'))
    else:
        # tf.initialize_all_variables was replaced in TensorFlow 0.12
        if int(tf.__version__.split('.')[1]) < 12 and int(tf.__version__.split('.')[0]) < 1:
            init = tf.initialize_all_variables()
        else:
            init = tf.global_variables_initializer()
        sess.run(init)

    iterations = 100
    for step in range(iterations):
        # One optimization step per fresh training batch
        next_batch, next_batch_labels = get_train_batch()
        sess.run(optimizer, {input_data: next_batch, labels: next_batch_labels})
        if step % 20 == 0:
            print("step:", step, " accuracy:", (sess.run(
                accuracy, {input_data: next_batch, labels: next_batch_labels})) * 100)

    if not os.path.exists("models"):
        os.mkdir("models")
    save_path = saver.save(sess, "models/model.ckpt")
    print("Model saved in path: %s" % save_path)