程式人生 > 樸素貝葉斯分類器簡單實現文字情感分析

樸素貝葉斯分類器簡單實現文字情感分析

樸素貝葉斯的一般過程:

① 收集資料:可以使用任何方法。

② 準備資料:需要數值型或者布林型資料。

③ 分析資料:有大量特徵時,繪製特徵作用不大,此時使用直方圖效果更好。

④ 訓練演算法:計算不同的獨立特徵的條件概率。

⑤ 測試演算法:計算錯誤率。

⑥ 使用演算法:一個常見的樸素貝葉斯應用是文件分類。可以在任意的分類場景中使用樸素貝葉斯分類器,不一定非要是文字。

程式碼如下:

import numpy as np


def load_dataset():
	"""Return the small hard-coded corpus used to train the classifier.

	Returns:
		A ``(sent_list, class_vec)`` tuple. ``sent_list`` is a list of six
		pre-tokenised sentences (each a list of word strings) and
		``class_vec`` is the parallel list of labels, each either 1 or -1.
		Judging by the example sentences, 1 appears to mark positive
		sentiment and -1 negative sentiment.
	"""
	# Keep each label next to its sentence so the two cannot drift apart.
	labelled_sentences = [
		(1, ['蒙牛', '很', '牛']),
		(-1, ['蒙牛', '又', '出來', '丟人', '了']),
		(1, ['獎品', '很', '給力', '為', '蒙牛', '為', '獎品']),
		(-1, ['珍愛', '生命', '遠離', '蒙牛']),
		(1, ['蒙牛', '大果粒', '就是', '好吃', '好吃', '好吃']),
		(-1, ['好在', '一直', '不吃', '蒙牛']),
	]

	sent_list = [tokens for _, tokens in labelled_sentences]
	class_vec = [label for label, _ in labelled_sentences]
	return sent_list, class_vec

def create_vocab_list(dataset):
	"""Collect every distinct token that occurs in *dataset*.

	Args:
		dataset: Iterable of documents, each an iterable of hashable tokens.

	Returns:
		A list containing each distinct token exactly once, in the order the
		tokens are first encountered.
	"""
	# A dict keeps insertion order, whereas a set of strings is ordered by
	# randomised hashes; using a dict makes the vocabulary (and therefore the
	# column layout of every feature vector) identical from run to run.
	vocab = {}
	for doc in dataset:
		for token in doc:
			vocab.setdefault(token, None)

	return list(vocab)

def set_of_words2vec(vocab_list, input_set):
	"""Encode a tokenised document as a 0/1 presence vector.

	Args:
		vocab_list: Sequence of known (hashable) words; position ``i`` of the
			result corresponds to ``vocab_list[i]``.
		input_set: Iterable of words from the document to encode.

	Returns:
		A list of ``len(vocab_list)`` ints holding 1 where the corresponding
		vocabulary word occurs in *input_set* and 0 elsewhere. Words missing
		from the vocabulary are ignored, and repeated words still yield 1.
	"""
	# Build the word -> column map once so each token is resolved with a
	# single hash lookup instead of two linear scans of the vocabulary.
	# setdefault keeps the first position of a duplicated vocabulary word,
	# which is the column list.index() would have reported.
	column_of = {}
	for column, word in enumerate(vocab_list):
		column_of.setdefault(word, column)

	return_vec = [0] * len(vocab_list)
	for word in input_set:
		column = column_of.get(word)
		if column is not None:
			return_vec[column] = 1

	return return_vec

def trainNB(train_matrix, train_catagory):
	"""Estimate naive Bayes parameters from vectorised training documents.

	Args:
		train_matrix: Sequence of equal-length numeric word vectors, one per
			training document.
		train_catagory: Sequence of labels parallel to *train_matrix*. A label
			equal to 1 puts the document in class 1; any other value puts it
			in class 0.

	Returns:
		A tuple ``(p0_vect, p1_vect, p_class1)``: two NumPy arrays with the
		per-word log conditional probabilities for class 0 and class 1, and
		the fraction of training documents whose label is 1.

	Raises:
		ValueError: If *train_matrix* is empty, is not two-dimensional, or
			does not have exactly one label per document.
	"""
	num_train_docs = len(train_matrix)
	if num_train_docs == 0:
		raise ValueError("train_matrix must contain at least one document")
	if len(train_catagory) != num_train_docs:
		# Without this check surplus labels would still be counted towards
		# the class prior even though no document backs them.
		raise ValueError("train_catagory must have one label per document")

	docs = np.asarray(train_matrix, dtype=float)
	if docs.ndim != 2:
		raise ValueError("train_matrix must be a sequence of word vectors")

	is_class1 = np.asarray(train_catagory) == 1
	class1_docs = docs[is_class1]
	class0_docs = docs[~is_class1]

	# Word counts start at 1 and denominators at 2.0 so that a word never
	# seen in one class does not produce log(0).
	p1_num = 1.0 + class1_docs.sum(axis=0)
	p0_num = 1.0 + class0_docs.sum(axis=0)
	p1_demon = 2.0 + class1_docs.sum()
	p0_demon = 2.0 + class0_docs.sum()

	# Work in log space so the classifier can add terms instead of
	# multiplying many small probabilities.
	p1_vect = np.log(p1_num / p1_demon)
	p0_vect = np.log(p0_num / p0_demon)

	p_class1 = float(np.count_nonzero(is_class1)) / num_train_docs
	return p0_vect, p1_vect, p_class1

def classifyNB(vec2classify, p0_vec, p1_vec, pClass1):
	"""Pick the class with the larger log score for a word vector.

	Args:
		vec2classify: NumPy word vector of the document to classify.
		p0_vec: Per-word log probabilities for class 0.
		p1_vec: Per-word log probabilities for class 1.
		pClass1: Prior probability of class 1; ``1.0 - pClass1`` is used as
			the prior of class 0.

	Returns:
		1 if the class-1 score is larger, -1 if the class-0 score is larger,
		and 0 if neither score exceeds the other.
	"""
	# Each score is the log prior plus the log likelihoods of the words
	# that are switched on in the vector.
	class1_score = np.log(pClass1) + sum(vec2classify * p1_vec)
	class0_score = np.log(1.0 - pClass1) + sum(vec2classify * p0_vec)

	if class1_score > class0_score:
		return 1
	if class0_score > class1_score:
		return -1
	return 0
	
	
# Train on the bundled corpus: build the vocabulary, then turn every
# sentence into a presence vector over that vocabulary.
list_sents, list_classes = load_dataset()
my_vocab_list = create_vocab_list(list_sents)
train_mat = [set_of_words2vec(my_vocab_list, sent_in_doc) for sent_in_doc in list_sents]

p0V, p1V, pAb = trainNB(train_mat, list_classes)

# Two unseen sentences; words missing from the vocabulary are ignored by
# set_of_words2vec.
test_entry1 = ['蒙牛', '真', '好吃', '好', '給力']
test_entry2 = ['再也', '不吃', '蒙牛', '了']

# Print the predicted label for each test sentence, in order.
for test_entry in (test_entry1, test_entry2):
	test_vec = np.array(set_of_words2vec(my_vocab_list, test_entry))
	print(classifyNB(test_vec, p0V, p1V, pAb))