
Chinese Short Text Classification


Feature extraction + naive Bayes model:

import random
import jieba
import pandas as pd
# Load stopwords
stopwords=pd.read_csv('D://input_py//day06//stopwords.txt',index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')
stopwords=stopwords['stopword'].values
# Load the corpus
laogong_df = pd.read_csv('D://input_py//day06//beilaogongda.csv', encoding='utf-8', sep=',')
laopo_df = pd.read_csv('D://input_py//day06//beilaopoda.csv', encoding='utf-8', sep=',')
erzi_df = pd.read_csv('D://input_py//day06//beierzida.csv', encoding='utf-8', sep=',')
nver_df = pd.read_csv('D://input_py//day06//beinverda.csv', encoding='utf-8', sep=',')
# Drop NaN rows from each corpus
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)
# Convert the segment column of each DataFrame to a list
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()

# Define the segmentation-and-labeling function preprocess_text
# content_lines is the list converted above
# sentences is an empty list that collects the labeled data
# category is the class label
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                                  # segment with jieba
            segs = [v for v in segs if not str(v).isdigit()]         # remove pure digits
            segs = list(filter(lambda x: x.strip(), segs))           # drop empty/whitespace tokens
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # remove stopwords
            sentences.append((" ".join(segs), category))             # attach the category label
        except Exception:
            print(line)
            continue
sentences = []
preprocess_text(laogong, sentences, 0)
preprocess_text(laopo, sentences, 1)
preprocess_text(erzi, sentences, 2)
preprocess_text(nver, sentences, 3)
random.shuffle(sentences)
# Print the first 10 samples
# for sentence in sentences[:10]:
#     print(sentence[0], sentence[1])  # index 0 is the token string, 1 is the label
# Extract bag-of-words features from the text
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    analyzer='word',    # tokenize on words
    max_features=4000,  # keep the 4000 most frequent terms
)
# Split the corpus into train and test sets
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)
# Fit the bag-of-words vocabulary on the training data
vec.fit(x_train)
# Build and train the model
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
# Evaluate on the test set (score returns mean accuracy, not AUC)
print(classifier.score(vec.transform(x_test), y_test))

The resulting accuracy score is 0.6587.
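To classify a new sentence, it has to go through the same cleanup and the same fitted vectorizer as the training data. Below is a minimal usage sketch, assuming classifier, vec, and stopwords from the script above are still in scope; the helper name predict_category and the commented-out sample call are made up for illustration:

# Hypothetical helper: apply the training-time preprocessing, then predict
def predict_category(text):
    segs = jieba.lcut(text)                               # segment with jieba
    segs = [v for v in segs if not str(v).isdigit()]      # same cleanup as training
    segs = [v for v in segs if v.strip() and len(v) > 1]  # drop blanks and single chars
    segs = [v for v in segs if v not in stopwords]        # remove stopwords
    return classifier.predict(vec.transform([" ".join(segs)]))[0]

# predict_category(some_text)  # returns a label in {0: laogong, 1: laopo, 2: erzi, 3: nver}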

Feature extraction + SVM model:

import random
import jieba
import pandas as pd
# Load stopwords
stopwords=pd.read_csv('D://input_py//day06//stopwords.txt',index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')
stopwords=stopwords['stopword'].values
# Load the corpus
laogong_df = pd.read_csv('D://input_py//day06//beilaogongda.csv', encoding='utf-8', sep=',')
laopo_df = pd.read_csv('D://input_py//day06//beilaopoda.csv', encoding='utf-8', sep=',')
erzi_df = pd.read_csv('D://input_py//day06//beierzida.csv', encoding='utf-8', sep=',')
nver_df = pd.read_csv('D://input_py//day06//beinverda.csv', encoding='utf-8', sep=',')
# Drop NaN rows from each corpus
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)
# Convert the segment column of each DataFrame to a list
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()

# Define the segmentation-and-labeling function preprocess_text
# content_lines is the list converted above
# sentences is an empty list that collects the labeled data
# category is the class label
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                                  # segment with jieba
            segs = [v for v in segs if not str(v).isdigit()]         # remove pure digits
            segs = list(filter(lambda x: x.strip(), segs))           # drop empty/whitespace tokens
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # remove stopwords
            sentences.append((" ".join(segs), category))             # attach the category label
        except Exception:
            print(line)
            continue
sentences = []
preprocess_text(laogong, sentences, 0)
preprocess_text(laopo, sentences, 1)
preprocess_text(erzi, sentences, 2)
preprocess_text(nver, sentences, 3)
random.shuffle(sentences)
# Split the corpus into train and test sets
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1256)
# Change the feature vectorizer: add word n-grams
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(
    analyzer='word',      # tokenize on words
    ngram_range=(1, 4),   # use ngrams of size 1 through 4
    max_features=20000,   # keep the 20000 most frequent ngrams
)
vec.fit(x_train)
# Train the model with a linear-kernel SVM
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(vec.transform(x_train), y_train)
print(svm.score(vec.transform(x_test), y_test))

The resulting accuracy score is 0.9976.
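A single accuracy number hides how the model does on each of the four classes. Below is a short evaluation sketch using scikit-learn's classification_report, assuming svm, vec, x_test, and y_test from the script above are in scope; the target_names follow the 0-3 label order used in the preprocess_text calls:

from sklearn.metrics import classification_report

y_pred = svm.predict(vec.transform(x_test))  # predict labels for the test set
print(classification_report(
    y_test, y_pred,
    target_names=['laogong', 'laopo', 'erzi', 'nver'],  # labels 0..3 in training order
))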