1. 程式人生 > >鬼吹燈文字挖掘5:sklearn實現文字聚類和文字分類

鬼吹燈文字挖掘5:sklearn實現文字聚類和文字分類

1. 準備資料

import numpy as np
import pandas as pd
import re
import jieba

# 章節判斷用變數預處理
def is_chap_head(tmpstr):
    import re
    pattern = re.compile('^第.{1,7}[章|回]')
    return len(pattern.findall(tmpstr))

# 獲取停用詞庫
my_stop_words_path = 'G:\\myLearning\\pythonLearning201712\\myDicts\\新建資料夾\\綜合stopword.txt'
stop_words_dict = []
with open(my_stop_words_path, errors='ignore') as fr:
    for line in fr.readlines():
        stop_words_dict.append(line.strip())
        
# 自定義分詞函式
def my_cut(inTxt):
    inTxt = re.sub('[a-zA-Z0-9]','',inTxt)
    jieba.lcut(inTxt)
    words_list = jieba.lcut(inTxt)
    return ' '.join([w for w in words_list if w not in stop_words_dict and len(w) > 1])

def my_cut2(inTxt):
    inTxt = re.sub('[a-zA-Z0-9]','',inTxt)
    jieba.lcut(inTxt)
    words_list = jieba.lcut(inTxt)
    return [w for w in words_list if w not in stop_words_dict and len(w) > 1]

# 1. 定義讀取資料的函式
def get_txt_data(file_name, words_dict_path, chap_merge = False, cut_func = my_cut2):
    # 先獲取射鵰各章節字串文件
    raw = pd.read_csv(file_name, names = ['txt'],sep = 'aaa',encoding = 'utf-8', engine = 'python')
    raw['is_chap_head'] = raw.txt.apply(is_chap_head)
    # raw['chap']  = 0  #初始化所有章節為0

     # 章節判斷
    chap_num = 0
    for i in range(len(raw)):
        if raw['is_chap_head'][i] == 1:
            chap_num += 1
        raw.loc[i,'chap'] = chap_num
        
    del raw['is_chap_head']
    
    if chap_merge:
        raw = raw.groupby(['chap']).sum()
    
    jieba.load_userdict(words_dict_path) 
    raw['words_list'] = raw.txt.apply(cut_func)
    
    return raw

file_path = 'G:\\自學筆記\\學習筆記:Python資料分析--玩轉文字挖掘\\txt文件\\'
dict_path = 'G:\\自學筆記\\學習筆記:Python資料分析--玩轉文字挖掘\\詞庫\\'
txt_names = ['鬥破蒼穹.txt','誅仙.txt','金庸-射鵰英雄傳txt精校版.txt',\
'鬼吹燈之精絕古城txt全本精校版.txt', '鬼吹燈之雲南蟲谷txt全本精校版.txt']
dict_names = ['鬥破蒼穹詞庫.txt','誅仙詞庫.txt','金庸小說詞庫.txt','鬼吹燈詞庫.txt']

%time dpcq_df = get_txt_data(file_path + txt_names[0], dict_path + dict_names[0], chap_merge = True )   # 文章太長,耗時較長
%time zx_df = get_txt_data(file_path + txt_names[1], dict_path + dict_names[1], chap_merge = True )
%time sdyxz_df = get_txt_data(file_path + txt_names[2], dict_path + dict_names[2], chap_merge = True )
%time gcd1_df = get_txt_data(file_path + txt_names[3], dict_path + dict_names[3], chap_merge = True )
%time gcd2_df = get_txt_data(file_path + txt_names[4], dict_path + dict_names[3], chap_merge = True )

2. 文件相似度的計算

# 1. 使用gensim中的word2vec實習
from gensim.models.word2vec import Word2Vec
n_dim = 300         # 指定向量維度,大樣本量是300至500較好

w2vmodel = Word2Vec(size = n_dim, min_count = 10)     # 至少在10個文件中出現過
w2vmodel.build_vocab(dpcq_df.words_list)                     # 生成詞表
w2vmodel

%time w2vmodel.train(dpcq_df.words_list,total_examples = w2vmodel.corpus_count, epochs = 10)
# 訓練完畢的模型實質
print(w2vmodel.wv['薰兒'].shape)
w2vmodel.wv['薰兒']
# 詞向量間的相似度
w2vmodel.wv.most_similar('鬥技')

[('功法', 0.7798707485198975),
 ('身法鬥技', 0.7401365637779236),
 ('地階', 0.7358179688453674),
 ('玄階高階', 0.7349050045013428),
 ('地階中級', 0.728278398513794),
 ('祕法', 0.7270081639289856),
 ('魂技', 0.7150101661682129),
 ('地階鬥技', 0.6921431422233582),
 ('帝印決', 0.6878658533096313),
 ('飛行鬥技', 0.6844722032546997)]
# 尋找對應關係
w2vmodel.wv.most_similar(positive=['蕭炎','異火'],negative=['小醫仙'],topn=10)

[('獸火', 0.4644716680049896),
 ('淨蓮妖火', 0.4551411271095276),
 ('骨靈冷火', 0.4455055594444275),
 ('火焰', 0.4415768086910248),
 ('隕落心炎', 0.44030460715293884),
 ('海心焰', 0.439494252204895),
 ('佛怒火蓮', 0.43488609790802),
 ('青蓮地心火', 0.4333166480064392),
 ('九龍雷罡火', 0.429574579000473),
 ('五輪', 0.4264797568321228)]
# 尋找不合群的詞
w2vmodel.wv.doesnt_match('蕭炎 薰兒 小醫仙 美杜莎 納蘭嫣然 彩鱗'.split())
'蕭炎'
# 尋找不合群的詞
w2vmodel.wv.doesnt_match('海心焰 青蓮地心火 隕落心炎 淨蓮妖火 納蘭嫣然'.split())
'納蘭嫣然'
# 2. 基於詞袋模型的計算:sklearn實現
cleanchap = [my_cut(w) for w in gcd2_df.txt]

from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(min_df=5)

resmtx = countvec.fit_transform(cleanchap)

from sklearn.metrics.pairwise import pairwise_distances

print(pairwise_distances(resmtx, metric = 'cosine').shape)
pairwise_distances(resmtx, metric = 'cosine')

(56, 56)
Out[17]:
array([[0.        , 0.35040081, 0.42686292, ..., 0.65277582, 0.73983346,
        0.67113954],
       [0.35040081, 0.        , 0.41634138, ..., 0.67092083, 0.73334226,
        0.67347242],
       [0.42686292, 0.41634138, 0.        , ..., 0.72646148, 0.76235288,
        0.73821901],
       ...,
       [0.65277582, 0.67092083, 0.72646148, ..., 0.        , 0.52471631,
        0.39741077],
       [0.73983346, 0.73334226, 0.76235288, ..., 0.52471631, 0.        ,
        0.5853083 ],
       [0.67113954, 0.67347242, 0.73821901, ..., 0.39741077, 0.5853083 ,
        0.        ]])
# 使用TF-IDF矩陣進行相似度計算
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(resmtx)         # 基於詞頻矩陣X計算TF-IDF值

pairwise_distances(tfidf[:5],metric='cosine')

array([[0.        , 0.54725386, 0.68972   , 0.78551127, 0.80340229],
       [0.54725386, 0.        , 0.63548046, 0.75853139, 0.8193562 ],
       [0.68972   , 0.63548046, 0.        , 0.5750506 , 0.56698607],
       [0.78551127, 0.75853139, 0.5750506 , 0.        , 0.3775796 ],
       [0.80340229, 0.8193562 , 0.56698607, 0.3775796 , 0.        ]])
# 3 gensim實現: gensim計算的相似矩陣很難被sklearn直接使用
from gensim import similarities
from gensim import corpora, models

chaplist = [my_cut2(w) for w in gcd1_df.txt]
dictionary = corpora.Dictionary(chaplist)
corpus = [dictionary.doc2bow(text) for text in chaplist]     # 仍為list of list

simmtx = similarities.MatrixSimilarity(corpus)
simmtx 
<gensim.similarities.docsim.MatrixSimilarity at 0x11f824e3080>
# 4. 基於LDA計算餘弦相似度
# 檢索和第一回內容最相似(所屬主題相同)的章節
simmtx = similarities.MatrixSimilarity(corpus)              # 使用的矩陣種類需要和擬合模型時相同
simmtx
<gensim.similarities.docsim.MatrixSimilarity at 0x11f8083b7f0>
simmtx.index[:].shape
(33, 15668)
# 使用gensim的LDA擬合結果進行演示
from gensim.models.ldamodel import LdaModel
tfidf_model = models.TfidfModel(corpus)        # 建立TF-IDF模型
corpus_tfidf = tfidf_model[corpus]             # 對所需文件計算TF-IDF結果
%time ldamodel = LdaModel(corpus_tfidf, id2word = dictionary, num_topics = 10, passes = 5)

query = gcd1_df.txt[1]
quer_bow = dictionary.doc2bow(my_cut2(query))

lda_vec = ldamodel[quer_bow]         # 轉換為lda模型下的向量
sims = simmtx[lda_vec]               # 進行矩陣內向量和所提供向量的餘弦相似度查詢
sims = sorted(enumerate(sims), key = lambda item:-item[1])
sims

3.  文件聚類

# 進行聚類分析
from sklearn.cluster import KMeans

clf = KMeans(n_clusters = 5)
s = clf.fit(tfidf)
print(s)
clf.cluster_centers_

print(len(clf.labels_))
clf.labels_
56
Out[26]:
array([0, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 0, 0, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4])
gcd2_df['clsres'] = clf.labels_
gcd2_df.head()

chapgrp = gcd2_df.groupby('clsres')
chapcls = chapgrp.agg(sum)              # 只有字串列的情況下,sum函式自動轉為合併字串

cuttxt = lambda x: ' '.join(my_cut2(x))
chapclsres = chapcls.txt.apply(cuttxt)
chapclsres

clsres
0    第一章 車禍 回到 北京 之後 我們 北京 老字號 美味 勝利 召開 第二屆 代表大會 會議...
1    第三十五章 凌雲宮 會仙殿 天宮 巨集偉 華麗 宮殿 正下方 只覺 整個 無比 渺小 宮殿 這...
2    第二章 彩雲客棧 我們 閒談 之間 汽車 下來 茶葉 販子 趕緊 招呼 我們 下車 遮龍山 ...
3    第九章 鬼訊號 自從 離開 部隊 之後 經常 噩夢 整晚 整晚 失眠 北京 做起 古玩 生意...
4    第五十二章 康巴阿公 楊輕嘆 一聲 說道 若言琴 琴聲 何不 若言聲 指頭 何不 於君 指上...
Name: txt, dtype: object
# 列出關鍵詞以刻畫類別特徵
import jieba.analyse as ana

ana.set_stop_words('G:\\自學筆記\\學習筆記:Python資料分析--玩轉文字挖掘\\詞庫\\停用詞.txt')

for item in chapclsres:
    print(ana.extract_tags(item, topK = 10))

['胖子', '獻王', '楊說', '東西', '獻王墓', '屍洞', '墓室', '女屍', '屍體', '葫蘆洞']
['胖子', '水銀', '獻王', '壁畫', '石碑', '宮殿', '天宮', '厲鬼', '巫衣', '楊說']
['竹筏', '胖子', '遮龍山', '獻王', '獻王墓', '山洞', '河道', '水中', '探照燈', '痋術']
['胖子', '玉棺', '機艙', '楊說', '訊號', '登山', '獻王', '肉線', '樹上', '樹身']
['喇嘛', '大個子', '格瑪', '幹事', '連長', '狼群', '古墳', '魔國', '餓狼', '軍醫']
gcd2_df2 = gcd2_df[['txt', 'words_list']]
zx_df['y'] = 'zx'
sdyxz_df['y'] = 'sdyxz'
gcd1_df['y'] = 'gcd1'
gcd2_df2['y'] = 'gcd2'
all_txt = pd.concat([zx_df, sdyxz_df, gcd1_df, gcd2_df2], axis = 0)      #按行合併
all_txt.head()

joinx = lambda x: ' '.join(x)
all_words = all_txt.words_list.apply(joinx)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
count_vectorizer = CountVectorizer(min_df = 5)
countMat = count_vectorizer.fit_transform(all_words)
countMat
<388x17224 sparse matrix of type '<class 'numpy.int64'>'
	with 394507 stored elements in Compressed Sparse Row format>

tfidf_transformer = TfidfTransformer()
tfidf_vec = tfidf_transformer.fit_transform(countMat)
tfidf_vec
<388x17224 sparse matrix of type '<class 'numpy.float64'>'
	with 394507 stored elements in Compressed Sparse Row format>

from sklearn.cluster import KMeans
km = KMeans(n_clusters=4)
y_pred = km.fit_predict(tfidf_vec)
km.cluster_centers_
array([[0.00160436, 0.        , 0.00056069, ..., 0.00271158, 0.00254711,
        0.        ],
       [0.00280139, 0.00070697, 0.00055412, ..., 0.00033704, 0.        ,
        0.        ],
       [0.00043656, 0.00270005, 0.00024102, ..., 0.        , 0.        ,
        0.00053978],
       [0.00287647, 0.00190007, 0.00010306, ..., 0.        , 0.        ,
        0.00159731]])

4. 文件分類

1) 直接分類

cutlist = lambda x: ' '.join(x)
x_data = all_txt.words_list.apply(cutlist)
y_data = list(all_txt.y)
# (1) 按詞頻分類
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
count_vectorizer = CountVectorizer(min_df = 5)
all_words_count = count_vectorizer.fit_transform(x_data)
all_words_count
<388x17224 sparse matrix of type '<class 'numpy.int64'>'
	with 394507 stored elements in Compressed Sparse Row format>

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(all_words_count, y_data, test_size = 0.2)

# 嘗試邏輯迴歸和SVM
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

my_lr = LogisticRegression()
my_svm1 = SVC(kernel = 'linear')
my_svm2 = SVC(kernel='rbf')

%time my_lr.fit(x_train,y_train)
%time my_svm1.fit(x_train,y_train)
%time my_svm2.fit(x_train,y_train)

from sklearn.metrics import classification_report
print(classification_report(y_test, my_lr.predict(x_test)))
print(classification_report(y_test, my_svm1.predict(x_test)))
print(classification_report(y_test, my_svm2.predict(x_test)))
precision    recall  f1-score   support

       gcd1       1.00      1.00      1.00         7
       gcd2       1.00      1.00      1.00        10
      sdyxz       1.00      1.00      1.00         5
         zx       1.00      1.00      1.00        56

avg / total       1.00      1.00      1.00        78

             precision    recall  f1-score   support

       gcd1       1.00      0.86      0.92         7
       gcd2       0.83      1.00      0.91        10
      sdyxz       1.00      1.00      1.00         5
         zx       1.00      0.98      0.99        56

avg / total       0.98      0.97      0.97        78

             precision    recall  f1-score   support

       gcd1       1.00      0.29      0.44         7
       gcd2       0.62      1.00      0.77        10
      sdyxz       1.00      1.00      1.00         5
         zx       1.00      0.98      0.99        56

avg / total       0.95      0.92      0.91        78
# (2) 只考慮詞是否出現
tests = np.nonzero(all_words_count)    # 找出非0值的行列索引
tests

(array([  0,   0,   0, ..., 387, 387, 387], dtype=int32),
 array([6988, 2301, 8935, ..., 1103, 6942, 9357], dtype=int32))

new_all_words_count = all_words_count
new_all_words_count[tests[0],tests[1]] = 1
new_all_words_count

<388x17224 sparse matrix of type '<class 'numpy.int64'>'
	with 394507 stored elements in Compressed Sparse Row format>

x_train,x_test,y_train,y_test = train_test_split(new_all_words_count, y_data, test_size = 0.2)

my_lr = LogisticRegression()
my_svm1 = SVC(kernel = 'linear')
my_svm2 = SVC(kernel='rbf')

%time my_lr.fit(x_train,y_train)
%time my_svm1.fit(x_train,y_train)
%time my_svm2.fit(x_train,y_train)

print(classification_report(y_test, my_lr.predict(x_test)))
print(classification_report(y_test, my_svm1.predict(x_test)))
print(classification_report(y_test, my_svm2.predict(x_test)))
 precision    recall  f1-score   support

       gcd1       1.00      0.75      0.86         4
       gcd2       0.95      1.00      0.97        19
      sdyxz       1.00      1.00      1.00        10
         zx       1.00      1.00      1.00        45

avg / total       0.99      0.99      0.99        78

             precision    recall  f1-score   support

       gcd1       1.00      0.75      0.86         4
       gcd2       0.95      1.00      0.97        19
      sdyxz       1.00      1.00      1.00        10
         zx       1.00      1.00      1.00        45

avg / total       0.99      0.99      0.99        78

             precision    recall  f1-score   support

       gcd1       0.00      0.00      0.00         4
       gcd2       0.00      0.00      0.00        19
      sdyxz       1.00      1.00      1.00        10
         zx       0.66      1.00      0.80        45

avg / total       0.51      0.71      0.59        78

2)PCA降維

from sklearn.decomposition import PCA
pca = PCA(n_components=0.9)
#PCA不支援sparse mat的操作,先轉換為標準矩陣
all_wc_mtx = all_words_count.todense()
new_x = pca.fit_transform(all_wc_mtx)
new_x_train,new_x_test,new_y_train,new_y_test = train_test_split(new_x,y_data,test_size = 0.3)
new_x_train.shape
(271, 147)
my_lr2 = LogisticRegression()
my_svm21 = SVC(kernel='linear')
my_svm22 = SVC(kernel='rbf')
%time my_lr2.fit(new_x_train, new_y_train)
%time my_svm21.fit(new_x_train, new_y_train)
%time my_svm22.fit(new_x_train, new_y_train)

from sklearn.metrics import classification_report
print(classification_report(new_y_test, my_lr2.predict(new_x_test)))
print(classification_report(new_y_test, my_svm21.predict(new_x_test)))
print(classification_report(new_y_test, my_svm22.predict(new_x_test)))
precision    recall  f1-score   support

       gcd1       0.69      1.00      0.82         9
       gcd2       0.85      1.00      0.92        11
      sdyxz       1.00      0.83      0.91        12
         zx       1.00      0.95      0.98        85

avg / total       0.96      0.95      0.95       117

             precision    recall  f1-score   support

       gcd1       0.82      1.00      0.90         9
       gcd2       1.00      1.00      1.00        11
      sdyxz       1.00      1.00      1.00        12
         zx       1.00      0.98      0.99        85

avg / total       0.99      0.98      0.98       117

             precision    recall  f1-score   support

       gcd1       0.00      0.00      0.00         9
       gcd2       1.00      0.18      0.31        11
      sdyxz       0.00      0.00      0.00        12
         zx       0.74      1.00      0.85        85

avg / total       0.63      0.74      0.65       117

3)使用卡方檢驗進行特徵選擇後再分類

from sklearn.feature_selection import SelectKBest,chi2
"""可嘗試選不同的k"""
model1 = SelectKBest(chi2, k=100)       # 選擇100個最好的特徵
new_x2 = model1.fit_transform(all_words_count,y_data)
new_x2
<388x100 sparse matrix of type '<class 'numpy.int64'>'
	with 3482 stored elements in Compressed Sparse Row format>
new_x_train2, new_x_test2, new_y_train2, new_y_test2 = train_test_split(new_x2, y_data, test_size = 0.3)

my_lr3 = LogisticRegression()
my_svm31 = SVC(kernel='linear')
my_svm32 = SVC(kernel='rbf')
%time my_lr3.fit(new_x_train2, new_y_train2)
%time my_svm31.fit(new_x_train2, new_y_train2)
%time my_svm32.fit(new_x_train2, new_y_train2)

print(classification_report(new_y_test2, my_lr3.predict(new_x_test2)))
print(classification_report(new_y_test2, my_svm31.predict(new_x_test2)))
print(classification_report(new_y_test2, my_svm32.predict(new_x_test2)))
precision    recall  f1-score   support

       gcd1       1.00      0.82      0.90        11
       gcd2       0.87      1.00      0.93        13
      sdyxz       1.00      0.93      0.97        15
         zx       0.99      1.00      0.99        78

avg / total       0.98      0.97      0.97       117

             precision    recall  f1-score   support

       gcd1       1.00      0.91      0.95        11
       gcd2       1.00      1.00      1.00        13
      sdyxz       1.00      1.00      1.00        15
         zx       0.99      1.00      0.99        78

avg / total       0.99      0.99      0.99       117

             precision    recall  f1-score   support

       gcd1       1.00      0.09      0.17        11
       gcd2       0.92      0.85      0.88        13
      sdyxz       0.00      0.00      0.00        15
         zx       0.75      1.00      0.86        78

avg / total       0.70      0.77      0.68       117