
Hands-on Case Study: Weibo Sentiment Analysis

Data: each text file contains the samples for one class.

0: joy; 1: anger; 2: disgust; 3: sadness
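For readable output later on, the label-to-emotion mapping can be kept in a small dict; `label_names` is a hypothetical helper for illustration, not part of the original scripts:

# Hypothetical helper: map the numeric labels to emotion names
label_names = {0: 'joy', 1: 'anger', 2: 'disgust', 3: 'sadness'}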

Steps

  1. Read the text files
  2. Split into training and test sets
  3. Extract features
  4. Train the model and predict

Code:

tools.py
# -*- coding: utf-8 -*-

import re
import jieba.posseg as pseg
import pandas as pd
import math
import numpy as np

# Load the common stop word lists
stopwords1 = [line.rstrip() for line in open('./中文停用詞庫.txt', 'r', encoding='utf-8')]
# stopwords2 = [line.rstrip() for line in open('./哈工大停用詞表.txt', 'r', encoding='utf-8')]
# stopwords3 = [line.rstrip() for line in open('./四川大學機器智慧實驗室停用詞庫.txt', 'r', encoding='utf-8')]
# stopwords = stopwords1 + stopwords2 + stopwords3
stopwords = stopwords1


def proc_text(raw_line):
    """
        Process one line of raw text and return the segmented words.
    """
    # 1. Remove all non-Chinese characters with a regular expression
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # 2. Segment with jieba and attach part-of-speech tags
    words_lst = pseg.cut(chinese_only)

    # 3. Remove stop words
    # (the POS tag could also be used to filter, e.g. keep only verbs:
    #  if (word not in stopwords) and (flag == 'v'): ...)
    meaningful_words = []
    for word, flag in words_lst:
        if word not in stopwords:
            meaningful_words.append(word)

    return ' '.join(meaningful_words)


def split_train_test(text_df, size=0.8):
    """
        Split the data into a training set and a test set.
    """
    # Process each class in turn so that the class proportions are the same
    # in the training set and in the test set
    train_text_df = pd.DataFrame()
    test_text_df = pd.DataFrame()

    labels = [0, 1, 2, 3]
    for label in labels:
        # Select the records with this label
        text_df_w_label = text_df[text_df['label'] == label]
        # Reset the index so each class is indexed from 0, which makes the split easier
        text_df_w_label = text_df_w_label.reset_index()

        # Default split: 80% training, 20% test. To keep things simple, the
        # first 80% of the rows go to the training set and the remaining 20%
        # to the test set; a random 80/20 split works too (a sketch follows
        # after these listings).
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        text_df_w_label_train = text_df_w_label.iloc[:split_line_no, :]
        text_df_w_label_test = text_df_w_label.iloc[split_line_no:, :]

        # Add to the overall training and test sets
        # (DataFrame.append was removed in pandas 2.0, hence pd.concat)
        train_text_df = pd.concat([train_text_df, text_df_w_label_train])
        test_text_df = pd.concat([test_text_df, text_df_w_label_test])

    train_text_df = train_text_df.reset_index()
    test_text_df = test_text_df.reset_index()
    return train_text_df, test_text_df


def get_word_list_from_data(text_df):
    """
        Collect all the words in the data set into one list.
    """
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list


def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """
        Feature extraction.
    """
    # Only TF-IDF features are used here as an example; word counts or other
    # text features could be added as well
    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]

    # Initialize the feature matrix and the label vector
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)

    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))
        text = r_data['text']

        feat_vec = []
        for word in common_words:
            if word in text.split():  # exact word match rather than substring match
                # The word is one of the frequent words: compute its TF-IDF value
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0
            feat_vec.append(tf_idf_val)

        # Fill in this sample's row
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])

    return X, y


def cal_acc(true_labels, pred_labels):
    """
        Compute the accuracy.
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]

    acc = sum(correct_list) / n_total
    return acc
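As a quick sanity check, proc_text can be run on a single raw line. The sample sentence and output below are illustrative only; the exact result depends on the stop word list in use:

from tools import proc_text

# Made-up example input: the URL, punctuation and other non-Chinese
# characters are stripped before segmentation
sample = '今天天氣真好,我很開心!http://t.cn/xxxx'
print(proc_text(sample))  # e.g. '今天 天氣 真好 開心'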

 
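The comment in split_train_test suggests trying a random split instead of taking the first 80% of each class. Below is a minimal sketch using DataFrame.sample, assuming the same label/text schema; random_split_train_test is a hypothetical alternative, not part of tools.py:

import pandas as pd

def random_split_train_test(text_df, size=0.8, seed=42):
    """Stratified random split: sample 80% of each class for training."""
    train_parts, test_parts = [], []
    for label in text_df['label'].unique():
        class_df = text_df[text_df['label'] == label]
        # Randomly pick 80% of this class for training; the rest is the test set
        train_part = class_df.sample(frac=size, random_state=seed)
        test_parts.append(class_df.drop(train_part.index))
        train_parts.append(train_part)
    train_df = pd.concat(train_parts).reset_index(drop=True)
    test_df = pd.concat(test_parts).reset_index(drop=True)
    return train_df, test_df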

main.py

# -*- coding: utf-8 -*-


import os
import pandas as pd
import nltk
from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# CSV file for the raw data
output_text_filename = 'raw_weibo_text.csv'

# File for the cleaned text data
output_cln_text_filename = 'clean_weibo_text.csv'

# Processing and cleaning the text data takes a while, so is_first_run
# controls it: set it to True on the first run, when the raw text still needs
# to be processed and cleaned; once the cleaned text data has been saved,
# set it to False.
is_first_run = True


def read_and_save_to_csv():
    """
        Read the raw text data and save the labels and text as a csv file.
    """

    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)

        # Extract the label, i.e. 0, 1, 2 or 3
        label = int(text_filename[0])

        # Read the text file
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        labels = [label] * len(lines)

        text_series = pd.Series(lines)
        label_series = pd.Series(labels)

        # Build the dataframe
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)

    result_df = pd.concat(text_w_label_df_lst, axis=0)

    # Save as a csv file
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=None, encoding='utf-8')


def run_main():
    """
        Main function.
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data must be processed and cleaned

        # Read the raw text data and save the labels and text as csv
        read_and_save_to_csv()

        # Read the prepared csv file and build the data set
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Process the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done; results saved.')

    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Inspect basic statistics of the training and test sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 200
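    # Assumption: 200 frequent words keeps the feature matrix small; a larger
    # vocabulary usually improves accuracy at the cost of memory and time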

    # Collect the words from the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
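    # Note: text_collection is built from whole strings, so nltk's tf() counts
    # substring occurrences over character length; passing tokenized word
    # lists instead would give word-level term frequencies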
    print('Extracting features for the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('Done')
    print()

    print('Extracting features for the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('Done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
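    # GaussianNB treats each TF-IDF value as a continuous feature;
    # MultinomialNB is another common choice for text features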
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Predict
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))

if __name__ == '__main__':
    run_main()
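As a cross-check of cal_acc and for a per-class breakdown, sklearn's metrics can be applied to the same predictions. A minimal sketch, assuming test_y and test_pred are made available (e.g. returned from run_main instead of only printed):

from sklearn.metrics import accuracy_score, classification_report

# Assumes test_y and test_pred as produced in run_main
print('Accuracy (sklearn):', accuracy_score(test_y, test_pred))
print(classification_report(test_y, test_pred,
                            target_names=['joy', 'anger', 'disgust', 'sadness']))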