1. 程式人生 > >python實現隨機森林、邏輯回歸和樸素貝葉斯的新聞文本分類

python實現隨機森林、邏輯回歸和樸素貝葉斯的新聞文本分類

ati int ces 平滑 讀取 inf dict http tor

本文使用的文本數據可以從THUCTC下載,也可以自己編寫爬蟲抓取。
本文主要參考:https://blog.csdn.net/hao5335156/article/details/82716923
nb表示樸素貝葉斯
rf表示隨機森林
lg表示邏輯回歸
初學者(例如我)通過學習本程序,可以鞏固python基礎,學會python文本處理和分類器的調用,方便接下來繼續學習機器學習。

各個參數直觀的含義:
技術分享圖片

# -*- coding: utf-8 -*-
"""
Created on Thu Nov 29 13:00:46 2018

@author: caoqu
"""
import matplotlib.pyplot as plt
import random
import os
import jieba

from sklearn.naive_bayes import MultinomialNB as NB  
from sklearn.linear_model.logistic import LogisticRegression as LR 
from sklearn.ensemble import RandomForestClassifier as RF 

# Text processing --> build the training set, the test set and the word-frequency list
def text_processor(text_path, test_size=0.2, sample_size=200):
    """Load a folder-per-class corpus, tokenize it and split it into train/test.

    Every sub-directory of ``text_path`` is treated as one class; the files
    inside it are the articles of that class.

    Args:
        text_path: Root directory containing one sub-directory per class.
        test_size: Fraction of the articles put into the test set. One extra
            article is always added to the test set (``int(n * test_size) + 1``).
        sample_size: Maximum number of articles drawn at random from each
            class. Classes with fewer files contribute all of their files.

    Returns:
        A 5-tuple ``(all_words_list, train_data_list, test_data_list,
        train_class_list, test_class_list)`` where ``all_words_list`` holds the
        distinct training-set tokens ordered by descending frequency, the
        ``*_data_list`` values are tuples of token lists, and the
        ``*_class_list`` values are tuples of labels (the class folder name
        encoded as UTF-8 bytes).

    Raises:
        ValueError: If there are too few articles to fill both splits.
    """
    from collections import Counter

    data_list = []      # one token list per article
    class_list = []     # label of the article stored at the same index

    # One iteration reads one class folder.
    for folder in os.listdir(text_path):
        new_folder_path = os.path.join(text_path, folder)
        if not os.path.isdir(new_folder_path):
            # Stray files next to the class folders are not classes.
            continue
        file_names = os.listdir(new_folder_path)
        # random.sample raises when asked for more items than exist, so cap
        # the sample at the number of files actually present.
        chosen = random.sample(file_names, min(sample_size, len(file_names)))
        # One iteration reads one article.
        for file_name in chosen:
            with open(os.path.join(new_folder_path, file_name), 'r', encoding='UTF-8') as fp:
                raw = fp.read()
            # cut_all=False selects jieba's precise segmentation mode.
            data_list.append(list(jieba.cut(raw, cut_all=False)))
            class_list.append(folder.encode('utf-8'))

    # Pair every article with its label, then shuffle before splitting.
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1

    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    if not train_list or not test_list:
        raise ValueError(
            'not enough articles under %r to build both a training and a test set'
            % (text_path,)
        )

    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # Word frequencies are computed on the training set only.
    word_counts = Counter(word for word_list in train_data_list for word in word_list)
    # most_common() sorts by descending count and keeps first-seen order on ties.
    all_words_list = [word for word, _ in word_counts.most_common()]

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

# Select the feature words
def words_dict(all_words_list, deleteN, stopwords_set=None, max_features=1000):
    """Pick feature words from a frequency-ordered vocabulary.

    Args:
        all_words_list: Candidate words, most frequent first.
        deleteN: Index of the first candidate to consider; the ``deleteN``
            most frequent words before it are skipped.
        stopwords_set: Words that must never become features. ``None`` means
            no stop words.
        max_features: Upper bound on the number of returned words.

    Returns:
        A list of at most ``max_features`` words, in vocabulary order, that
        are not purely digits, are not stop words and are 2 to 4 characters
        long.
    """
    if stopwords_set is None:
        stopwords_set = frozenset()

    feature_words = []
    for t in range(deleteN, len(all_words_list)):
        if len(feature_words) >= max_features:  # dimensionality cap reached
            break
        word = all_words_list[t]
        if not word.isdigit() and word not in stopwords_set and 1 < len(word) < 5:
            feature_words.append(word)
    return feature_words
    
# Text features
def text_features(train_data_list, test_data_list, feature_words):
    """Turn tokenized articles into binary presence vectors.

    Each article becomes a list with one entry per feature word: 1 when the
    word occurs in the article, 0 otherwise.

    Args:
        train_data_list: Iterable of token sequences for the training set.
        test_data_list: Iterable of token sequences for the test set.
        feature_words: Ordered feature vocabulary.

    Returns:
        ``(train_feature_list, test_feature_list)``, two lists of 0/1 vectors.
    """
    def encode(tokens):
        # A set makes each membership test O(1).
        present = set(tokens)
        return [int(term in present) for term in feature_words]

    train_feature_list = list(map(encode, train_data_list))
    test_feature_list = list(map(encode, test_data_list))
    return train_feature_list, test_feature_list

# De-duplicate the stop words
def make_word_set(words_file):
    """Read a one-word-per-line UTF-8 file into a set.

    Args:
        words_file: Path of the text file to read.

    Returns:
        The set of distinct, whitespace-stripped, non-empty lines.
    """
    words_set = set()
    with open(words_file, 'r', encoding='UTF-8') as fp:
        # Iterate the file lazily instead of loading every line at once.
        for line in fp:
            word = line.strip()
            if word:
                words_set.add(word)
    return words_set

# Mean of a list
def average(accuracy_list):
    """Return the arithmetic mean of ``accuracy_list`` rounded to 3 decimals.

    Args:
        accuracy_list: Sized collection of numbers.

    Returns:
        The rounded mean, or ``0.0`` when the collection is empty.
    """
    if not accuracy_list:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return round(sum(accuracy_list) / len(accuracy_list), 3)

# Classify and report the test accuracy
def text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag):
    """Train the classifier selected by ``flag`` and score it on the test set.

    Args:
        train_feature_list: Feature vectors of the training articles.
        test_feature_list: Feature vectors of the test articles.
        train_class_list: Labels matching ``train_feature_list``.
        test_class_list: Labels matching ``test_feature_list``.
        flag: ``'nb'`` for multinomial naive Bayes, ``'lg'`` for logistic
            regression, ``'rf'`` for random forest.

    Returns:
        The value of the classifier's ``score`` method on the test set.

    Raises:
        ValueError: If ``flag`` is not one of the supported values.
    """
    if flag == 'nb':
        # Naive Bayes with the library's default smoothing and priors.
        classifier = NB()
    elif flag == 'lg':
        # Logistic regression: liblinear solver, generous iteration budget,
        # multi-class strategy left to the library.
        classifier = LR(solver='liblinear', max_iter=5000, multi_class='auto')
    elif flag == 'rf':
        # Random forest with 200 trees.
        classifier = RF(n_estimators=200)
    else:
        raise ValueError("unknown classifier flag %r; expected 'nb', 'lg' or 'rf'" % (flag,))

    classifier.fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy

def start(flag, folder_path='D:/WorkSpace/THUCTC/THUCNews/',
          stopwords_path='D:/WorkSpace/tmp/py/stop_words_cn.txt'):
    """Run one experiment: accuracy as a function of skipped top words.

    The corpus is loaded and split once. The selected classifier is then
    trained and scored repeatedly, each time skipping 20 more of the most
    frequent words (0, 20, ..., 980) before choosing the feature words. The
    mean and maximum accuracy are printed.

    Args:
        flag: Classifier selector passed to ``text_classifier``
            (``'nb'``, ``'lg'`` or ``'rf'``).
        folder_path: Corpus root directory; change it to your own path.
        stopwords_path: Path of the stop-word file; change it to your own path.

    Returns:
        ``(deleteNs, test_accuracy_list)``: the range of skipped-word counts
        and the accuracy obtained for each of them, in the same order.
    """
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = text_processor(folder_path, test_size=0.2)
    stopwords_set = make_word_set(stopwords_path)

    # Feature extraction and classification for every skip count.
    deleteNs = range(0, 1000, 20)
    test_accuracy_list = []
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = text_features(train_data_list, test_data_list, feature_words)
        # text_classifier dispatches on the flag itself.
        test_accuracy = text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag=flag)
        test_accuracy_list.append(test_accuracy)

    print(flag + '平均準確度:', average(test_accuracy_list))
    print(flag + '最大準確度:', round(max(test_accuracy_list), 3))
    return deleteNs, test_accuracy_list
       
if __name__ == "__main__":
    # Each run re-samples and re-splits the corpus.
    n_runs = 5
    # (classifier flag, matplotlib line format) for every compared model.
    classifiers = (('nb', 'b'), ('lg', 'k'), ('rf', 'r'))

    # Figure decoration is loop-invariant. grid(True) is explicit because a
    # bare grid() call toggles the grid on every invocation.
    plt.figure(figsize=(13, 11))
    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.grid(True)

    for run in range(n_runs):
        for flag, line_format in classifiers:
            delete_ns, accuracy_list = start(flag)
            # Label only the first run so the legend has one entry per
            # classifier; labels starting with "_" are left out of the legend.
            label = flag if run == 0 else '_nolegend_'
            plt.plot(delete_ns, accuracy_list, line_format, label=label)
            # Mark the maximum at its real x position.
            best_accuracy = max(accuracy_list)
            best_delete_n = delete_ns[accuracy_list.index(best_accuracy)]
            plt.annotate('大', xy=(best_delete_n, best_accuracy))

    plt.legend()
    plt.show()

運行結果:技術分享圖片

其他參數請自行修改

python實現隨機森林、邏輯回歸和樸素貝葉斯的新聞文本分類