
[Machine Learning] Sample code for calling Python's third-party lda package

#!/usr/bin/env python
# coding=utf-8
'''
[Sample input data]
word1 word2 word3 word4 word5 word6 word7 word8 word9
word1 word2 word3 word4 word5
word1 word2 word3 word4 word5 word6 word7
...
One line is one pre-tokenized document; words are separated by spaces.

[Key parameters]
1. n_topics: number of topics, i.e. how many clusters to group the documents into
2. n_iter: number of iterations (Gibbs sampling sweeps)

[Program output]
1. doc-topic distribution: the probability that each document belongs to each topic; with 20 topics,
   for example, the first document's doc-topic distribution is its probability of belonging to each
   of those 20 topics (20 probability values in total)
2. topic-word distribution: the distribution of words within each topic, i.e. each word's
   probability/weight under that topic
3. the 5 highest-weighted words in each topic
4. the most likely topic for each document
'''
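# Sketch (assuming lda ~1.x): besides n_topics and n_iter, lda.LDA also accepts
# the Dirichlet priors alpha (doc-topic smoothing, default 0.1) and eta
# (topic-word smoothing, default 0.01), plus refresh (logging interval), e.g.:
#   lda.LDA(n_topics=20, n_iter=1500, alpha=0.1, eta=0.01, random_state=1)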
import codecs
import collections
import numpy as np
import lda
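# "lda" here is the third-party package of the same name (pip install lda),
# not sklearn's LatentDirichletAllocation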

# read all the words in the tokenized corpus and deduplicate them into a vocabulary
filePath = 'F:/getKeyWords/'
cutWordsFile = 'cutWordsFile.txt'   # corpus file: one tokenized document per line
wordSet = set()
with codecs.open(filePath + cutWordsFile, 'r', 'utf-8') as f:
    for eachLine1 in f:
        wordSet.update(eachLine1.split())   # split() drops the trailing newline and empty tokens
wordList = sorted(wordSet)   # sorted so the column order is reproducible across runs

# build the term-frequency matrix: one row per document, one column per word,
# value = how many times that word occurs in that document
# (rows = number of documents, columns = size of the deduplicated vocabulary)
# the result is a large, sparse matrix
wordMatrix = []
with codecs.open(filePath + cutWordsFile, 'r', 'utf-8') as f:
    for eachLine2 in f:
        counts = collections.Counter(eachLine2.split())
        wordMatrix.append([counts[w] for w in wordList])   # Counter returns 0 for absent words
X = np.array(wordMatrix)    # term-frequency matrix; lda expects non-negative integer counts
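# Alternative sketch: scikit-learn's CountVectorizer builds the same document-term
# matrix as a scipy sparse matrix, much faster than the loop above (assumes
# scikit-learn is installed; shown commented out so this script stays self-contained):
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(tokenizer=str.split, lowercase=False)
# with codecs.open(filePath + cutWordsFile, 'r', 'utf-8') as f:
#     X_sparse = vectorizer.fit_transform(f)
# lda is documented to accept sparse matrices too; if an older version complains,
# convert with X_sparse.toarray(). The column order then comes from
# vectorizer.get_feature_names_out() rather than wordList.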

# train the model
model = lda.LDA(n_topics=10, n_iter=50, random_state=1)
model.fit(X)
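# Note (sketch, assuming lda ~1.x): fit() logs the log likelihood every `refresh`
# iterations (default 10) through the standard logging module, and
# model.loglikelihood() returns the final value after fitting; a curve that has
# flattened out suggests rough convergence. 50 iterations is on the low side for
# a real corpus, e.g.:
# print('final log likelihood: {:.2f}'.format(model.loglikelihood()))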

# doc-topic distribution
print('==================doc:topic==================')
doc_topic = model.doc_topic_
print(type(doc_topic))
print(doc_topic.shape)
print(doc_topic)    # one row per document: its probability of belonging to each topic; each row sums to 1
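# optional sanity check: each row of doc_topic_ is a probability distribution, so it should sum to 1
assert np.allclose(doc_topic.sum(axis=1), 1.0)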

# topic-word distribution
print('==================topic:word==================')
topic_word = model.topic_word_
print(type(topic_word))
print(topic_word.shape)
print(topic_word[:, :3])    # one row per topic: that topic's probability distribution over words (each full row sums to 1); only the first 3 columns are printed
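# likewise, each full row of topic_word_ sums to 1 across the whole vocabulary
assert np.allclose(topic_word.sum(axis=1), 1.0)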

# the 5 highest-weighted words in each topic
n = 5
print('==================topic top' + str(n) + ' word==================')
for i, topic_dist in enumerate(topic_word):
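    # argsort sorts ascending, so the reversed slice [:-(n+1):-1] picks the n highest-probability words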
    topic_words = np.array(wordList)[np.argsort(topic_dist)][:-(n+1):-1]
    print('*Topic {}\n-{}'.format(i, ' '.join(topic_words)))

# the most likely topic for each document (only the first 10 are printed below;
# doc_topic.shape[0] gives the total document count)
print('==================doc best topic==================')
for i in range(10):
    topic_most_pr = doc_topic[i].argmax()
    print('doc: {} ,best topic: {}'.format(i, topic_most_pr))
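# vectorized equivalent that labels every document at once (sketch; best_topics
# is just an illustrative name and is not part of the printed output below):
# best_topics = doc_topic.argmax(axis=1)   # shape: (number of documents,)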

'''
[Sample run results]
==================doc:topic==================
<class 'numpy.ndarray'>
(6543, 10)
[[ 0.3137931   0.00344828  0.21034483 ...,  0.21034483  0.00344828
   0.00344828]
 [ 0.002       0.102       0.002      ...,  0.002       0.302       0.122     ]
 [ 0.58076923  0.00384615  0.00384615 ...,  0.35        0.00384615
   0.00384615]
 ...,
 [ 0.06        0.00285714  0.00285714 ...,  0.00285714  0.26        0.17428571]
 [ 0.05121951  0.00243902  0.19756098 ...,  0.73414634  0.00243902
   0.00243902]
 [ 0.003125    0.003125    0.003125   ...,  0.003125    0.003125    0.503125  ]]
==================topic:word==================
<class 'numpy.ndarray'>
(10, 14849)
[[  5.16569216e-07   5.16569216e-07   5.16569216e-07]
 [  4.88126565e-07   4.88126565e-07   4.88126565e-07]
 [  4.05227598e-07   4.05227598e-07   4.05227598e-07]
 [  4.64630254e-07   4.64630254e-07   4.64630254e-07]
 [  4.59569595e-07   4.59569595e-07   1.38330448e-04]
 [  5.04172278e-07   5.04172278e-07   5.04172278e-07]
 [  4.50724743e-07   4.50724743e-07   4.50724743e-07]
 [  5.32552540e-07   5.37878066e-05   5.32552540e-07]
 [  4.28183189e-07   4.28183189e-07   4.28183189e-07]
 [  4.11413842e-05   4.07340438e-07   4.07340438e-07]]
==================topic top5 word==================
*Topic 0
-5 words (business-specific terms, redacted; same below)
*Topic 1
-5 words
*Topic 2
-5 words
*Topic 3
-5 words
*Topic 4
-5 words
*Topic 5
-5 words
*Topic 6
-5 words
*Topic 7
-5 words
*Topic 8
-5 words
*Topic 9
-5 words
==================doc best topic==================
doc: 0 ,best topic: 0
doc: 1 ,best topic: 3
doc: 2 ,best topic: 0
doc: 3 ,best topic: 9
doc: 4 ,best topic: 8
doc: 5 ,best topic: 1
doc: 6 ,best topic: 9
doc: 7 ,best topic: 5
doc: 8 ,best topic: 2
doc: 9 ,best topic: 3
'''