
Machine Learning in Action, Chapter 4: Naive Bayes Classification (Source Code Walkthrough)

Naive Bayes Classification

#coding=utf-8
'''
Created on 2016-01-09

@author: admin
'''


from numpy import *

# Load the sample data set
def loadDataSet():
    # Six tokenized posts (the book's dog-board example)
    postingList = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Class label vector: 1 = abusive, 0 = not abusive
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

# Build the vocabulary list
def createVocabList(dataSet):
    # Start from an empty set of words
    vocabSet = set([])
    # Walk through every document
    for document in dataSet:
        # Union each document's words into vocabSet; | is set union
        vocabSet = vocabSet | set(document)
    # Return the vocabulary as a list
    return list(vocabSet)
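
With the sample data the vocabulary collects every distinct word across the six posts. A quick interactive session (not part of the original post; the list's ordering varies from run to run because it comes from a set, but its length does not):

>>> listOPosts, listClasses = loadDataSet()
>>> myVocabList = createVocabList(listOPosts)
>>> len(myVocabList)
32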

# Convert a document into a set-of-words vector
def setOfWords2Vec(vocabList, inputSet):
    # Vector to return, one slot per vocabulary word
    returnVec = [0] * len(vocabList)
    # Walk through the words of the input document
    for word in inputSet:
        # If the word is in the vocabulary...
        if word in vocabList:
            # ...set the corresponding position to 1
            returnVec[vocabList.index(word)] = 1
        else:
            print "the word: %s is not in my Vocabulary!" % word
    # Return the vector
    return returnVec
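
Each post becomes a 0/1 vector over that vocabulary. Continuing the session above, the positions of the ones depend on the set ordering, but the vector length and the count of ones do not (post 0 has seven distinct words, all in the vocabulary):

>>> vec = setOfWords2Vec(myVocabList, listOPosts[0])
>>> len(vec), sum(vec)
(32, 7)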

# Convert a document into a bag-of-words vector that counts occurrences
def bagOfWords2VecMN(vocabList, inputSet):
    # Vector to return, one slot per vocabulary word
    returnVec = [0] * len(vocabList)
    # Walk through the words of the input document
    for word in inputSet:
        # If the word is in the vocabulary...
        if word in vocabList:
            # ...increment that word's occurrence count
            returnVec[vocabList.index(word)] += 1
    # Return the vector
    return returnVec
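
The only change from setOfWords2Vec is += instead of =, turning the set-of-words model into a bag-of-words model that keeps occurrence counts. With an explicit toy vocabulary (just an illustration, not data from the book) the result is fully deterministic:

>>> bagOfWords2VecMN(['dog', 'stupid', 'my'], ['stupid', 'dog', 'stupid'])
[1, 2, 0]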

# Naive Bayes training: estimate the conditional probabilities
def trainNB0(trainMatrix, trainCategory):
    # Number of training documents
    numTrainDocs = len(trainMatrix)
    # Number of words in the vocabulary
    numWords = len(trainMatrix[0])
    # Prior probability of the abusive class (1 = abusive, 0 = not abusive)
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Word counter, one row of numWords entries; p0 is not abusive.
    # Initialized to ones (not zeros) for Laplace smoothing, so an unseen
    # word does not force a zero probability:
    # p0Num = zeros(numWords)
    p0Num = ones(numWords)
    # Counter for the abusive class
    p1Num = ones(numWords)
    # Denominators start at 2.0 to match the add-one smoothing above
    p0Denom = 2.0
    p1Denom = 2.0
    # Walk through the training documents
    for i in range(numTrainDocs):
        # Accumulate word counts for the abusive class;
        # the rows of trainMatrix are the word vectors built above
        if trainCategory[i] == 1:
            # p1Num accumulates per-word occurrence counts
            p1Num += trainMatrix[i]
            # p1Denom accumulates the total word count
            p1Denom += sum(trainMatrix[i])
        # Accumulate word counts for the not-abusive class
        else:
            # Per-word occurrence counts under not abusive
            p0Num += trainMatrix[i]
            # Total word count under not abusive
            p0Denom += sum(trainMatrix[i])
    # Per-word probabilities under the abusive class; the log avoids
    # floating-point underflow when many small probabilities are multiplied:
    # p1Vect = p1Num / p1Denom
    p1Vect = log(p1Num / p1Denom)
    # Per-word probabilities under the not-abusive class
    # p0Vect = p0Num / p0Denom
    p0Vect = log(p0Num / p0Denom)
    # Return the word probability vectors and the abusive prior;
    # the not-abusive prior is 1 - pAbusive
    return p0Vect, p1Vect, pAbusive
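
Two numerical tricks in trainNB0 deserve emphasis. Initializing the counters to ones and the denominators to 2.0 is Laplace (add-one) smoothing: a word never seen in one class would otherwise get probability 0 and zero out the whole product at classification time. Taking the log turns a product of many small probabilities into a sum, which avoids floating-point underflow; a quick check (not in the original post):

>>> from math import log
>>> 0.01 ** 200                  # the raw product of 200 small factors underflows to 0
0.0
>>> round(200 * log(0.01), 2)    # the equivalent sum of logs stays representable
-921.03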

# Classify a word vector given the trained probabilities
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Log probability of the abusive class
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    # Log probability of the not-abusive class
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    # Pick the class with the larger (log) probability
    if p1 > p0:
        return 1
    else:
        return 0
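
classifyNB works entirely in log space: since log(a*b) = log(a) + log(b), the log of the naive Bayes numerator is

    log( p(w1|c) * ... * p(wn|c) * p(c) ) = log p(w1|c) + ... + log p(wn|c) + log p(c)

Multiplying vec2Classify element-wise by the log-probability vector keeps exactly the terms for words that appear in the document, and the shared denominator p(w) from Bayes' rule is dropped because it cannot change which class wins the comparison.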

# Smoke test on the sample posts
def testingNB():
    # Load the sample data
    listOPosts, listClass = loadDataSet()
    # Build the vocabulary list
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClass))
    # print p0V, p1V, pAb
    # print trainMat
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)

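Running the smoke test should reproduce the book's result: the friendly sentence falls into class 0 and the insult into class 1:

>>> testingNB()
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1
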
# Text parsing
# Input is one big string, output is a list of words
def textParse(bigString):
    # Import the regular expression module
    import re
    # Split on any run of non-word characters; the original used r'\W*',
    # which also matches the empty string and can yield empty tokens
    listOfTokens = re.split(r'\W+', bigString)
    # Return the lower-cased tokens longer than two characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
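
A quick check of the tokenizer (the sentence is just an illustration): punctuation is split away, tokens of two characters or fewer are dropped, and everything is lower-cased:

>>> textParse('Hello, my name is Mr. Smith!!')
['hello', 'name', 'smith']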

# Spam filter test
def spamTest():
    # docList: per-document word lists; classList: labels; fullText: all words
    docList = []; classList = []; fullText = []
    # Read the 25 txt files under each of email/spam and email/ham
    for i in range(1, 26):
        # Parse a spam email into its word list
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        # Append the word list to the document list
        docList.append(wordList)
        # Collect all of its words into fullText
        fullText.extend(wordList)
        # Class 1 = spam
        classList.append(1)
        # Parse the matching ham (non-spam) email
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        # Append it to the document list
        docList.append(wordList)
        # Collect its words as well
        fullText.extend(wordList)
        # Class 0 = ham
        classList.append(0)
    # Build the vocabulary list
    vocabList = createVocabList(docList)
    # Indices of the 50 documents; testSet will hold the held-out indices
    trainingSet = range(50); testSet = []
    # Randomly pick 10 documents as the test set (hold-out cross validation)
    for i in range(10):
        # Random index into the remaining training set
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document index into the test set
        testSet.append(trainingSet[randIndex])
        # ...and remove it from the training set
        del(trainingSet[randIndex])
    # Training matrix and its class labels
    trainMat = []; trainClasses = []
    # Walk the training set to gather the counts for training
    for docIndex in trainingSet:
        # Turn each word list into a bag-of-words vector
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        # Matching class label
        trainClasses.append(classList[docIndex])
    # Estimate the prior and the conditional probabilities
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    # Error counter
    errorCount = 0
    # Classify the held-out documents
    for docIndex in testSet:
        # Vectorize the test document
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        # Compare the prediction against the true label
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            # Misclassified: bump the error count
            errorCount += 1
            print "classification error", docList[docIndex]
    # Report the error rate
    print 'the error rate is: ', float(errorCount) / len(testSet)
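
Because the ten test documents are drawn at random, the printed error rate varies from run to run; the book reports an average around 6% over repeated runs. A minimal sketch, assuming spamTest were modified to also return float(errorCount)/len(testSet) rather than only print it, would average the hold-out estimate:

def averagedSpamTest(numRuns=10):
    # Assumes spamTest() returns its error rate (a hypothetical change,
    # not part of the original code above)
    total = 0.0
    for _ in range(numRuns):
        total += spamTest()
    return total / numRuns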