
[Machine Learning] Vectorizing Text Data (TF-IDF) --- Worked Sample-Set Example + Python Implementation

1. Vectorizing Text Data

1.1 Terminology

CF: collection frequency, the number of times a term occurs in the documents (tallied per document in this post).

DF: document frequency, the number of documents in which a term appears.

IDF: inverse document frequency, idf = log(N/(1+df)), where N is the total number of documents; the denominator uses 1+df so the formula also works when df = 0.

TF: term frequency, how often a term occurs within a document (its raw count divided by the document's total word count).

TF-IDF: TF-IDF = TF * IDF
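As a quick numeric check of these definitions, here is a minimal sketch for a single term; it assumes base-10 logarithms, which is also what the implementation below uses:

import math

N = 4       # total number of documents
df = 2      # number of documents containing the term
tf = 1/5    # the term's count in one document / that document's length

idf = math.log10(N / (1 + df))  # log(4/3) ≈ 0.1249
tfidf = tf * idf                # ≈ 0.0250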

1.2 Sample Document Set

To illustrate vectorizing text data, suppose we have 4 documents containing 6 distinct terms in total, as shown below.

doc1: iphone guuci huawei watch huawei
doc2: huawei watch iphone watch iphone guuci
doc3: skirt skirt skirt flower
doc4: watch watch huawei

1.3 Computation Summary

             iphone         watch          guuci          huawei         skirt          flower
doc1 TF      1/5            1/5            1/5            2/5            0              0
doc2 TF      2/6            2/6            1/6            1/6            0              0
doc3 TF      0              0              0              0              3/4            1/4
doc4 TF      0              2/3            0              1/3            0              0

DF (number of documents containing the term)
             2              3              2              3              1              1

IDF (inverse document frequency) = log(N/(1+DF))
             log(4/(1+2))   log(4/(1+3))   log(4/(1+2))   log(4/(1+3))   log(4/(1+1))   log(4/(1+1))
             = log(4/3)     = log(4/4)     = log(4/3)     = log(4/4)     = log(4/2)     = log(4/2)

doc1 TF-IDF  1/5*log(4/3)   1/5*log(4/4)   1/5*log(4/3)   2/5*log(4/4)   0              0
doc2 TF-IDF  2/6*log(4/3)   2/6*log(4/4)   1/6*log(4/3)   1/6*log(4/4)   0              0
doc3 TF-IDF  0              0              0              0              3/4*log(4/2)   1/4*log(4/2)
doc4 TF-IDF  0              2/3*log(4/4)   0              1/3*log(4/4)   0              0
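The tables above can be cross-checked with a few lines of vectorized numpy; this is a sketch that assumes the column order iphone, watch, guuci, huawei, skirt, flower and the same base-10 log as the code in section 1.4:

import numpy as np

#raw counts per document; columns: iphone watch guuci huawei skirt flower
counts = np.array([[1, 1, 1, 2, 0, 0],   #doc1
                   [2, 2, 1, 1, 0, 0],   #doc2
                   [0, 0, 0, 0, 3, 1],   #doc3
                   [0, 2, 0, 1, 0, 0]])  #doc4

tf = counts / counts.sum(axis=1, keepdims=True)  #per-document term frequency
df = (counts > 0).sum(axis=0)                    #documents containing each term
idf = np.log10(counts.shape[0] / (1 + df))       #log10(N/(1+DF))
print(tf * idf)                                  #matches the TF-IDF table above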

1.4 Implementing TF-IDF

Implemented by hand first: relatively speaking, TF-IDF is quite simple to code up.

# -*- coding: utf-8 -*-
"""
Author:蔚藍的天空tom
Talk is cheap, show me the code
Aim: implement TF-IDF vectorization of text data
"""
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import TfidfTransformer

def sklearn_tfidf():
    tag_list = ['iphone guuci huawei watch huawei',
                'huawei watch iphone watch iphone guuci',
                'skirt skirt skirt flower',
                'watch watch huawei']
    
    vectorizer = CountVectorizer() #turn the documents into a term-count matrix
    X = vectorizer.fit_transform(tag_list) #count each term's occurrences
       
    transformer = TfidfTransformer()  
    tfidf = transformer.fit_transform(X)  #turn the count matrix X into TF-IDF values
    print(tfidf.toarray())
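    #Caveat (added note): TfidfTransformer's defaults (smooth_idf=True, norm='l2')
    #use idf = ln((1+N)/(1+df)) + 1 and L2-normalize each row, so its output
    #will not numerically match the hand-computed tables in section 1.3.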
    
def tfidf_alg():
    docs = np.array(['iphone guuci huawei watch huawei',
                     'huawei watch iphone watch iphone guuci',
                     'skirt skirt skirt flower',
                     'watch watch huawei'])
    
    words = np.array(['iphone', 'guuci', 'huawei', 'watch', 'skirt', 'flower'])
    #calc cf way1: raw count of each term in each document
    cfs = []
    for e in docs:
        cf = [e.split().count(word) for word in words] #split first so a term cannot match inside a longer token
        cfs.append(cf)
    print('cfs way1:\n', np.array(cfs))
    
    #calc cf way2: the same counts built with extend() and a generator
    cfs = []
    cfs.extend([e.split().count(word) for word in words] for e in docs)
    cfs = np.array(cfs)
    print('cfs way2:\n', cfs)
    
    #calc tf way1: each term's frequency within its document
    tfs = []
    for e in cfs:
        tf = e/np.sum(e)
        tfs.append(tf)
    print('tfs way1:\n', np.array(tfs))

    #calc tf way2: each term's frequency within its document
    tfs = []
    tfs.extend(e/np.sum(e) for e in cfs) #extend() consumes the generator; append() would add the generator object itself
    print('tfs way2:\n', np.array(tfs))
    
    #calc df way1: number of documents containing each term
    dfs = list(np.zeros(words.size, dtype=int))
    for i in range(words.size):
        for doc in docs:
            if words[i] in doc.split(): #whole-token membership instead of substring search
                dfs[i] += 1
    print('calc df way1:', dfs)
    
    #calc df way2: number of documents containing each term
    dfs = []
    for i in range(words.size):
        oneHot = [1 if words[i] in doc.split() else 0 for doc in docs]
        dfs.append(oneHot.count(1))
        #print('word', words[i], 'df:', oneHot.count(1))
    print('calc df way2:', dfs)
    
    #calc df way3: number of documents containing each term
    dfs, oneHots = [], []
    for word in words:
        oneHots.append([1 if word in e.split() else 0 for e in docs])
    dfs.extend(e.count(1) for e in oneHots)
    print('calc oneHots way3:', np.array(oneHots))
    print('calc df way3:', dfs)
    
    #calc df way4: number of documents containing each term
    dfs = []
    oneHots = [[1 if word in doc.split() else 0 for doc in docs] for word in words]
    dfs.extend(e.count(1) for e in oneHots)
    print('calc oneHots way4:', np.array(oneHots))
    #dfs = np.reshape(dfs, (np.shape(dfs)[0], 1)) #reshape into an n×1 column vector
    #print('calc df way4:', dfs)
    
    #calc idf: each term's inverse document frequency
    #idf = log10(N/(1+DF))
    N = np.shape(docs)[0]
    idfs = [np.log10(N*1.0/(1+e)) for e in dfs]
    print('idfs:', np.array(idfs))
    
    #calc tf-idf: term frequency * inverse document frequency
    tfidfs = []
    for i in range(np.shape(docs)[0]):
        word_tfidf = np.multiply(tfs[i], idfs)
        tfidfs.append(word_tfidf)
        #print('word_tfidf:',word_tfidf)
    print('calc tfidfs:\n', np.array(tfidfs))
    
    print('==================result============================')
    print('\ndocs:\n', np.array(docs))
    
    print('\nwords:\n', np.array(words))
    
    print('\noneHots:\n', np.array(oneHots))
    
    print('\nCF:\n', np.array(cfs))
    
    print('\nTF:\n', np.array(tfs))
    
    print('\nDF:\n', np.array(dfs))
    
    print('\nIDF:\n', np.array(idfs))
    
    print('\nTF-IDF:\n', np.array(tfidfs))
    print('==============================================')
    return    

if __name__=='__main__':
    tfidf_alg()
    #sklearn_tfidf()
    

1.5 Run Results

==================result============================

docs:
 ['iphone guuci huawei watch huawei'
 'huawei watch iphone watch iphone guuci' 'skirt skirt skirt flower'
 'watch watch huawei']

words:
 ['iphone' 'guuci' 'huawei' 'watch' 'skirt' 'flower']

oneHots:
 [[1 1 0 0]
 [1 1 0 0]
 [1 1 0 1]
 [1 1 0 1]
 [0 0 1 0]
 [0 0 1 0]]

CF:
 [[1 1 2 1 0 0]
 [2 1 1 2 0 0]
 [0 0 0 0 3 1]
 [0 0 1 2 0 0]]

TF:
 [[ 0.2         0.2         0.4         0.2         0.          0.        ]
 [ 0.33333333  0.16666667  0.16666667  0.33333333  0.          0.        ]
 [ 0.          0.          0.          0.          0.75        0.25      ]
 [ 0.          0.          0.33333333  0.66666667  0.          0.        ]]

DF:
 [2 2 3 3 1 1]

IDF:
 [ 0.12493874  0.12493874  0.          0.          0.30103     0.30103   ]

TF-IDF:
 [[ 0.02498775  0.02498775  0.          0.          0.          0.        ]
 [ 0.04164625  0.02082312  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.2257725   0.0752575 ]
 [ 0.          0.          0.          0.          0.          0.        ]]
==============================================
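Note that the watch and huawei columns (and therefore all of doc4's row) are zero: both terms appear in 3 of the 4 documents, so their IDF is log(4/(1+3)) = log(1) = 0. With the 1+DF smoothing, a term that appears in N-1 documents contributes nothing, and one that appears in all N documents would even get a negative IDF; such common terms carry little discriminative information.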

(end)