【機器學習】python第三方模組lda包呼叫程式碼
阿新 • 發佈:2019-01-04
# coding=utf-8
# !/usr/bin/env python
'''
[Input data sample]
    word1 word2 word3 word4 word5 word6 word7 word8 word9
    word1 word2 word3 word4 word5
    word1 word2 word3 word4 word5 word6 word7
    ...
One line is one already-tokenized document; words are separated by spaces.

[Main parameters]
1. n_topics: number of topics, i.e. how many clusters the documents are grouped into
2. n_iter: number of iterations

[Program output]
1. doc-topic distribution: probability of each document belonging to each topic
   (e.g. with 20 topics, each document gets 20 probability values)
2. topic-word distribution: per-topic word probabilities/weights
3. the 5 highest-weighted words of each topic
4. the most likely topic of each document
'''
import codecs
import collections

import numpy as np

import lda

filePath = 'F:/getKeyWords/'
cutWordsFile = 'cutWordsFile.txt'  # corpus file: one tokenized document per line

# Read the corpus exactly once (the original opened the same file three times
# and never closed any of the handles).
with codecs.open(filePath + cutWordsFile, 'r', 'utf-8') as corpusFile:
    docs = [line.strip().split(' ') for line in corpusFile]

# Deduplicated vocabulary. Sorted so that the column order -- and therefore the
# fitted model -- is reproducible across runs; plain list(set(...)) would vary
# with hash randomization even though random_state is fixed below.
wordList = sorted({w.strip() for doc in docs for w in doc})
# word -> column index: lets each document be counted in O(len(doc)) instead of
# scanning the whole vocabulary per document (original was O(docs * vocab)).
wordIndex = {w: j for j, w in enumerate(wordList)}

# Term-frequency matrix: one row per document, one column per word, value =
# frequency of that word in that document. Rows = number of documents,
# columns = size of the deduplicated vocabulary. A large sparse matrix.
X = np.zeros((len(docs), len(wordList)), dtype=np.int64)
for rowIdx, doc in enumerate(docs):
    for w, cnt in collections.Counter(t.strip() for t in doc).items():
        X[rowIdx, wordIndex[w]] = cnt

# Model training.
model = lda.LDA(n_topics=10, n_iter=50, random_state=1)
model.fit(X)

# doc-topic distribution: each row is one document's probabilities over every
# topic; each row sums to 1.
print('==================doc:topic==================')
doc_topic = model.doc_topic_
print(type(doc_topic))
print(doc_topic.shape)
print(doc_topic)

# topic-word distribution: each row is one topic's probability distribution
# over the vocabulary; each row sums to 1.
print('==================topic:word==================')
topic_word = model.topic_word_
print(type(topic_word))
print(topic_word.shape)
print(topic_word[:, :3])

# The n highest-weighted words within each topic.
n = 5
print('==================topic top' + str(n) + ' word==================')
vocab = np.array(wordList)  # hoisted: build the array once, not per topic
for i, topic_dist in enumerate(topic_word):
    # argsort ascending, then take the last n in reverse -> top-n by weight.
    topic_words = vocab[np.argsort(topic_dist)][:-(n + 1):-1]
    print('*Topic {}\n-{}'.format(i, ' '.join(topic_words)))

# Most likely topic for each of the first 10 documents. min() guards against a
# corpus with fewer than 10 documents (the original indexed blindly with
# range(10) and could raise IndexError; it also re-read the whole file through
# a leaked handle just to count lines it never used).
print('==================doc best topic==================')
for i in range(min(10, doc_topic.shape[0])):
    topic_most_pr = doc_topic[i].argmax()
    print('doc: {} ,best topic: {}'.format(i, topic_most_pr))

'''
[Sample program output]
==================doc:topic==================
<class 'numpy.ndarray'>
(6543, 10)
[[ 0.3137931   0.00344828  0.21034483 ...,  0.21034483  0.00344828  0.00344828]
 [ 0.002       0.102       0.002      ...,  0.002       0.302       0.122     ]
 [ 0.58076923  0.00384615  0.00384615 ...,  0.35        0.00384615  0.00384615]
 ...,
 [ 0.06        0.00285714  0.00285714 ...,  0.00285714  0.26        0.17428571]
 [ 0.05121951  0.00243902  0.19756098 ...,  0.73414634  0.00243902  0.00243902]
 [ 0.003125    0.003125    0.003125   ...,  0.003125    0.003125    0.503125  ]]
==================topic:word==================
<class 'numpy.ndarray'>
(10, 14849)
[[  5.16569216e-07   5.16569216e-07   5.16569216e-07]
 [  4.88126565e-07   4.88126565e-07   4.88126565e-07]
 [  4.05227598e-07   4.05227598e-07   4.05227598e-07]
 [  4.64630254e-07   4.64630254e-07   4.64630254e-07]
 [  4.59569595e-07   4.59569595e-07   1.38330448e-04]
 [  5.04172278e-07   5.04172278e-07   5.04172278e-07]
 [  4.50724743e-07   4.50724743e-07   4.50724743e-07]
 [  5.32552540e-07   5.37878066e-05   5.32552540e-07]
 [  4.28183189e-07   4.28183189e-07   4.28183189e-07]
 [  4.11413842e-05   4.07340438e-07   4.07340438e-07]]
==================topic top5 word==================
*Topic 0 -5個詞(涉及具體業務,具體詞語已遮蔽,下同)
*Topic 1 -5個詞
*Topic 2 -5個詞
*Topic 3 -5個詞
*Topic 4 -5個詞
*Topic 5 -5個詞
*Topic 6 -5個詞
*Topic 7 -5個詞
*Topic 8 -5個詞
*Topic 9 -5個詞
==================doc best topic==================
doc: 0 ,best topic: 0
doc: 1 ,best topic: 3
doc: 2 ,best topic: 0
doc: 3 ,best topic: 9
doc: 4 ,best topic: 8
doc: 5 ,best topic: 1
doc: 6 ,best topic: 9
doc: 7 ,best topic: 5
doc: 8 ,best topic: 2
doc: 9 ,best topic: 3
'''