1. 程式人生 > >【python與機器學習入門3】樸素貝葉斯2——垃圾郵件分類

【python與機器學習入門3】樸素貝葉斯2——垃圾郵件分類

參考部落格:《樸素貝葉斯基礎篇之言論過濾器》(po主 Jack-Cui),本文大部分內容轉載自該文。

                 

參考書籍:《機器學習實戰》——第四章4.6

樸素貝葉斯的基礎內容請見前一篇文章。

1 資料集

ham 資料夾:正常郵件(程式讀取 email/ham/1.txt 至 25.txt)

spam 資料夾:垃圾郵件(程式讀取 email/spam/1.txt 至 25.txt)

2 資料處理

3 訓練

4 測試

#!/usr/bin/env python
#_*_coding:utf-8_*_
from numpy import *
import re
'''
    文字分類-過濾垃圾郵件
'''


def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: The text to tokenise.

    Returns:
        A list of lowercased tokens in order of appearance.  Tokens of
        length 2 or less (including the empty strings the split produces
        at the edges of the text) are dropped.
    """
    # Split on runs of ONE OR MORE non-word characters.  The previous
    # pattern r'\W*' can match the empty string; since Python 3.7 re.split
    # also splits on empty matches, which chops the text into single
    # characters so that the length filter below discards everything.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def createVocablist(dataset):
    """Build the vocabulary: every distinct word found in any document.

    Args:
        dataset: A sequence of documents, each an iterable of hashable words.

    Returns:
        A list containing each distinct word exactly once.  The order comes
        from set iteration and is therefore unspecified.
    """
    # Union of all documents' word sets, starting from the empty set.
    return list(set().union(*dataset))

def setOfWord2Vec(vocabList,dataset):
    """Encode one document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to vocabList[i].  Words must be hashable.
        dataset: Iterable of the words of a single document.

    Returns:
        A list of len(vocabList) ints: 1 where the vocabulary word occurs
        in the document at least once, 0 elsewhere.  Words that are not in
        the vocabulary are reported on stdout and otherwise ignored.
    """
    # Map every word to its first position once, instead of scanning the
    # list twice (`in` followed by `.index`) for each word of the document.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in dataset:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] = 1  # mark the word as present
        else:
            # A parenthesised single-argument print behaves the same on
            # Python 2 and Python 3; the bare print statement is a
            # SyntaxError on Python 3.
            print("sorry this word %s is not in our vocablist" % word)
    return returnVec


def trainNB(trainMatrix,trainCategory):
    """Estimate the naive Bayes parameters from vectorised documents.

    Computes log p(w_i | c) for every word and both classes: the numerator
    is how often each word occurs in the documents of a class, the
    denominator is the total number of word occurrences in that class.
    Word counts start at 1 and the totals at 2.0, so no word ends up with
    probability zero.

    Args:
        trainMatrix: Sequence of word vectors, one per training document,
            all of the same length.
        trainCategory: Sequence of labels aligned with trainMatrix.  A label
            equal to 1 selects class 1; any other value is counted as
            class 0.

    Returns:
        A tuple (p0Vec, p1Vec, pSpam): the per-word log-probability vectors
        for class 0 and class 1, and the fraction of class-1 documents
        (sum of the labels divided by the number of documents).
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    class1_prior = float(sum(trainCategory)) / float(doc_total)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    occurrence_totals = [2.0, 2.0]
    for row in range(doc_total):
        slot = 1 if trainCategory[row] == 1 else 0
        word_counts[slot] += trainMatrix[row]
        occurrence_totals[slot] += sum(trainMatrix[row])

    # Work in log space so the classifier can add instead of multiply.
    class1_log = log(word_counts[1] / occurrence_totals[1])
    class0_log = log(word_counts[0] / occurrence_totals[0])
    return class0_log, class1_log, class1_prior

def classifyNB(vec2classify,p0vec,p1vec,pclass1):
    """Pick the more likely of two classes for one word vector.

    The score of class c is the log of p(w0|c) * p(w1|c) * ... * p(wn|c) * p(c),
    i.e. the sum of the per-word log-probabilities of the words present in
    the document plus the log of the class prior.

    Args:
        vec2classify: Word vector of the document to classify.
        p0vec: Per-word log-probabilities for class 0.
        p1vec: Per-word log-probabilities for class 1.
        pclass1: Prior probability of class 1; class 0 uses 1 - pclass1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    class0_score = sum(vec2classify * p0vec) + log(1 - pclass1)
    class1_score = sum(vec2classify * p1vec) + log(pclass1)
    return 1 if class1_score > class0_score else 0


def spamTest():
    """Train and evaluate the naive Bayes spam filter on a random hold-out split.

    Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt .. 25.txt
    (label 0), builds the vocabulary from all of them, holds out 10 randomly
    chosen documents as the test set, trains on the remaining ones, and
    prints the true label of every misclassified test document followed by
    the error rate.

    Returns:
        None.  All results are written to stdout.
    """
    classList = []
    docList = []
    # Load the messages, alternating spam (1) and ham (0) for each number.
    for i in range(1, 26):
        for pathPattern, label in (('email/spam/%d.txt', 1),
                                   ('email/ham/%d.txt', 0)):
            # `with` closes the file right away instead of leaking the handle.
            with open(pathPattern % i, 'r') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocablist(docList)

    # list(...) is required: a Python 3 range object does not support item
    # deletion.  The size is derived from the data instead of hard-coding 50.
    trainSetIndex = list(range(len(docList)))
    testSetIndex = []
    # Move 10 random documents from the training pool to the test set;
    # removing each pick from the pool prevents choosing it twice.
    for _ in range(10):
        # random.uniform(a, b) yields a random float in the given range;
        # int() truncates it to an index.  min() guards against the upper
        # bound itself being returned because of floating-point rounding.
        randIndex = min(int(random.uniform(0, len(trainSetIndex))),
                        len(trainSetIndex) - 1)
        testSetIndex.append(trainSetIndex.pop(randIndex))

    # Vectorise the training documents and collect their labels.
    trainMat = [setOfWord2Vec(vocabList, docList[docIndex])
                for docIndex in trainSetIndex]
    trainClass = [classList[docIndex] for docIndex in trainSetIndex]
    p0Vec, p1Vec, pSpam = trainNB(trainMat, trainClass)

    # Vectorise each held-out document, classify it and count the misses.
    errorCount = 0.0
    for docIndex in testSetIndex:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        trueClass = classList[docIndex]
        if classifyNB(wordVector, p0Vec, p1Vec, pSpam) != trueClass:
            # Parenthesised single-argument print works on Python 2 and 3.
            print("classify wrong:origin %d" % trueClass)
            errorCount += 1
    print("error rate = %.2f" % (errorCount / len(testSetIndex)))



# Run the spam-filter experiment only when this file is executed as a script.
if __name__ == "__main__":
    spamTest()