樸素貝葉斯分類器簡單實現文字情感分析
阿新 • • 發佈:2019-01-08
樸素貝葉斯的一般過程:
① 收集資料:可以使用任何方法。
② 準備資料:需要數值型或者布林型資料。
③ 分析資料:有大量特徵時,繪製特徵作用不大,此時使用直方圖效果更好。
④ 訓練演算法:計算不同的獨立特徵的條件概率。
⑤ 測試演算法:計算錯誤率。
⑥ 使用演算法:一個常見的樸素貝葉斯應用是文件分類。可以在任意的分類場景中使用樸素貝葉斯分類器,不一定非要是文字。
程式碼如下:
import numpy as np


def load_dataset():
    """Return the toy sentiment corpus used by the demo.

    Returns:
        A ``(sent_list, class_vec)`` pair. ``sent_list`` holds six
        pre-tokenized sentences and ``class_vec`` the label of each one:
        ``1`` for the positive class, ``-1`` for the negative class.
    """
    sent_list = [
        ['蒙牛', '很', '牛'],
        ['蒙牛', '又', '出來', '丟人', '了'],
        ['獎品', '很', '給力', '為', '蒙牛', '為', '獎品'],
        ['珍愛', '生命', '遠離', '蒙牛'],
        ['蒙牛', '大果粒', '就是', '好吃', '好吃', '好吃'],
        ['好在', '一直', '不吃', '蒙牛'],
    ]
    class_vec = [1, -1, 1, -1, 1, -1]
    return sent_list, class_vec


def create_vocab_list(dataset):
    """Return every distinct token found in ``dataset``.

    Args:
        dataset: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list of unique tokens in first-seen order. Unlike ``list(set(...))``
        this order does not change between interpreter runs, so feature
        vectors built from the vocabulary are reproducible.
    """
    return list(dict.fromkeys(word for doc in dataset for word in doc))


def set_of_words2vec(vocab_list, input_set):
    """Encode a tokenized document as a 0/1 presence vector.

    Args:
        vocab_list: Vocabulary; position ``i`` of the result corresponds to
            ``vocab_list[i]``.
        input_set: Iterable of tokens making up the document.

    Returns:
        A list of ``len(vocab_list)`` ints where entry ``i`` is 1 if
        ``vocab_list[i]`` occurs in ``input_set`` and 0 otherwise. Tokens
        that are not in the vocabulary are ignored.
    """
    # Build the word -> position map once instead of scanning the list for
    # every token. setdefault keeps the first position of a repeated word,
    # which is what list.index() would have returned.
    index_of = {}
    for position, word in enumerate(vocab_list):
        index_of.setdefault(word, position)

    return_vec = [0] * len(vocab_list)
    for word in input_set:
        position = index_of.get(word)
        if position is not None:
            return_vec[position] = 1
    return return_vec


def trainNB(train_matrix, train_catagory):
    """Estimate naive Bayes parameters from 0/1 document vectors.

    Args:
        train_matrix: Sequence of equal-length feature vectors, one per
            training document.
        train_catagory: Sequence of labels, one per document. Label ``1``
            marks the positive class; any other value is treated as the
            negative class.

    Returns:
        A tuple ``(p0_vect, p1_vect, p_class1)``: the per-word log
        conditional probabilities for the negative and positive class, and
        the prior probability of the positive class.

    Raises:
        ValueError: If the training set is empty or the number of labels
            differs from the number of documents.
    """
    num_train_docs = len(train_matrix)
    if num_train_docs == 0:
        raise ValueError("train_matrix must contain at least one document")
    if len(train_catagory) != num_train_docs:
        raise ValueError(
            "train_matrix and train_catagory must have the same length"
        )

    matrix = np.asarray(train_matrix, dtype=float)
    is_positive = np.array([label == 1 for label in train_catagory], dtype=bool)
    p_class1 = float(np.count_nonzero(is_positive)) / num_train_docs

    pos_rows = matrix[is_positive]
    neg_rows = matrix[~is_positive]

    # Smoothing: every word count starts at 1 and every denominator at 2, so
    # a word never seen in a class still gets a non-zero probability and the
    # logarithm below stays finite.
    p1_num = 1.0 + pos_rows.sum(axis=0)
    p0_num = 1.0 + neg_rows.sum(axis=0)
    p1_denom = 2.0 + pos_rows.sum()
    p0_denom = 2.0 + neg_rows.sum()

    # Work in log space so that products of many small probabilities become
    # sums and do not underflow.
    p1_vect = np.log(p1_num / p1_denom)
    p0_vect = np.log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_class1


def classifyNB(vec2classify, p0_vec, p1_vec, pClass1):
    """Classify a feature vector with trained naive Bayes parameters.

    Args:
        vec2classify: Feature vector of the document (same length as the
            probability vectors).
        p0_vec: Log conditional probabilities for the negative class.
        p1_vec: Log conditional probabilities for the positive class.
        pClass1: Prior probability of the positive class.

    Returns:
        ``1`` if the positive class scores higher, ``-1`` if the negative
        class scores higher, and ``0`` when both scores are equal.
    """
    features = np.asarray(vec2classify, dtype=float)
    # A prior of exactly 0 or 1 yields log(0) = -inf, which still compares
    # correctly; only the divide-by-zero warning is silenced.
    with np.errstate(divide="ignore"):
        log_prior1 = np.log(pClass1)
        log_prior0 = np.log(1.0 - pClass1)

    p1 = np.dot(features, p1_vec) + log_prior1
    p0 = np.dot(features, p0_vec) + log_prior0
    if p1 > p0:
        return 1
    if p0 > p1:
        return -1
    return 0


# Demo: train on the toy corpus and classify two unseen sentences.
list_sents, list_classes = load_dataset()
my_vocab_list = create_vocab_list(list_sents)
train_mat = [set_of_words2vec(my_vocab_list, sent) for sent in list_sents]
p0V, p1V, pAb = trainNB(train_mat, list_classes)

test_entry1 = ['蒙牛', '真', '好吃', '好', '給力']
test_entry2 = ['再也', '不吃', '蒙牛', '了']
print(classifyNB(np.array(set_of_words2vec(my_vocab_list, test_entry1)), p0V, p1V, pAb))
print(classifyNB(np.array(set_of_words2vec(my_vocab_list, test_entry2)), p0V, p1V, pAb))