程式人生 > Python【極簡】文字分類模型

Python【極簡】文字分類模型

樸素貝葉斯-多項分佈【極簡】模型

# Training set: four food-themed phrases and four IT-themed phrases.
texts = ['酸奶蛋糕', '酸奶芝士', '芝士蛋糕', '酸奶芝士蛋糕',
         '文字挖掘', '挖掘資料', '文字資料', '挖掘文字資料']
labels = ['food', 'food', 'food', 'food',
          'IT', 'IT', 'IT', 'IT']

# Tokenize each text into a list of words with jieba.
from jieba import lcut
ls_of_words = [lcut(text) for text in texts]

# Build a token -> integer-id dictionary from the tokenized corpus.
from gensim import corpora
dictionary = corpora.Dictionary(ls_of_words)
dt = dictionary.token2id
length = len(dt)

# One-hot representation (illustration only; the vectors are not kept).
for word, wid in dt.items():
    vector = [0] * length
    vector[wid] = 1

# Sentence vectors: bag-of-words counts over the dictionary.
ls_of_wid = []
for words in ls_of_words:
    vector = [0] * length
    for word in words:
        vector[dt[word]] += 1
    ls_of_wid.append(vector)

# Split into training and test sets (default 75% / 25%, shuffled).
from sklearn.model_selection import train_test_split
train_labels, test_labels, train_wids, test_wids = train_test_split(labels, ls_of_wid)

# Train a multinomial naive Bayes classifier on the count vectors.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()  # naive Bayes classifier
classifier.fit(train_wids, train_labels)

# Evaluate: mean accuracy on the held-out test set.
score = classifier.score(test_wids, test_labels)
print(score)

過程詳解

1、分詞

# Training set: four food-themed phrases and four IT-themed phrases.
texts = ['酸奶蛋糕', '酸奶芝士', '芝士蛋糕', '酸奶芝士蛋糕',
         '文字挖掘', '挖掘資料', '文字資料', '挖掘文字資料']
labels = ['food', 'food', 'food', 'food',
          'IT', 'IT', 'IT', 'IT']
# Tokenize: jieba's lcut splits each phrase into a list of words.
from jieba import lcut
ls_of_words = list(map(lcut, texts))
print(ls_of_words)

[['酸奶', '蛋糕'],
['酸奶', '芝士'],
['芝士', '蛋糕'],
['酸奶', '芝士', '蛋糕'],
['文字', '挖掘'],
['挖掘', '資料'],
['文字', '資料'],
['挖掘', '文字', '資料']]

2、構造詞典

# Build the vocabulary: gensim assigns each distinct token an integer id.
from gensim import corpora
dictionary = corpora.Dictionary(ls_of_words)
# token2id is a plain dict mapping token -> id; later steps index with it.
dt = dictionary.token2id
print(dt)

{'蛋糕': 0, '酸奶': 1, '芝士': 2, '挖掘': 3, '文字': 4, '資料': 5}

3、one hot representation(冗餘的詞向量)

# One-hot representation: each word becomes a vector with a single 1
# at its dictionary id (a redundant but illustrative encoding).
length = len(dt)
for word, wid in dt.items():
    one_hot = [1 if idx == wid else 0 for idx in range(length)]
    print(word, one_hot)

蛋糕 [1, 0, 0, 0, 0, 0]
酸奶 [0, 1, 0, 0, 0, 0]
芝士 [0, 0, 1, 0, 0, 0]
挖掘 [0, 0, 0, 1, 0, 0]
文字 [0, 0, 0, 0, 1, 0]
資料 [0, 0, 0, 0, 0, 1]

4、句向量(詞向量疊加而成)

# Sentence vector = sum of its words' one-hot vectors, i.e. a
# bag-of-words count vector over the dictionary.
ls_of_wid = []
for words in ls_of_words:
    bow = [0] * length
    for token in words:
        bow[dt[token]] += 1
    ls_of_wid.append(bow)
print(ls_of_wid)

[[1, 1, 0, 0, 0, 0],
[0, 1, 1, 0, 0, 0],
[1, 0, 1, 0, 0, 0],
[1, 1, 1, 0, 0, 0],
[0, 0, 0, 1, 1, 0],
[0, 0, 0, 1, 0, 1],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 1, 1, 1]]

5、劃分訓練集和測試集

from sklearn.model_selection import train_test_split
# Default split: 75% train / 25% test, shuffled; both sequences are
# partitioned with the same row indices, keeping labels aligned to vectors.
splits = train_test_split(labels, ls_of_wid)
train_labels, test_labels, train_wids, test_wids = splits

6、貝葉斯模型訓練

from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes suits bag-of-words count features.
# sklearn's fit() returns the estimator itself, so build and train in one go.
classifier = MultinomialNB().fit(train_wids, train_labels)

7、模型測評

# Predict the class of a new sentence by building its count vector.
sentence = '挖掘酸奶酸奶酸奶酸奶文字資料'
wids = [0] * length
# doc2idx maps out-of-vocabulary tokens to -1 by default; skip those so
# they don't silently increment the last slot of the vector (wids[-1]).
for wid in dictionary.doc2idx(lcut(sentence)):
    if wid != -1:
        wids[wid] += 1
print('句向量', wids)
print('預測值', classifier.predict([wids]))

句向量 [0, 4, 0, 1, 1, 1]
預測值 ['food']

附錄

en cn
MultinomialNB Naive Bayes classifier for multinomial models
Multinomial Distribution 多項式分佈
naive 天真的
vector 向量
doc2idx document to index