樸素貝葉斯Python實現
阿新 • • 發佈:2019-01-10
貝葉斯定理:
from math import *
from numpy import *
import random
# Build the toy training corpus and its class labels.
def loadData():
    """Return a small tokenized corpus and its labels.

    Returns:
        postingList: list of documents, each a list of word tokens.
        classVec: parallel labels, 0 = normal wording, 1 = abusive wording.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 0 = normal, 1 = abusive
    return postingList, classVec
根據上面的資料集建立詞彙庫
輸入即是上面的資料集postingList
輸出無重複的詞彙庫
def createVocaList(dataset):
    """Build a deduplicated vocabulary list from a list of documents.

    Args:
        dataset: iterable of documents, each a list of word tokens.
    Returns:
        A list of the unique words seen across all documents (unordered).
    """
    vocab = set()
    for doc in dataset:
        vocab.update(doc)  # set union drops duplicate words
    return list(vocab)
# Map a single document onto the vocabulary: binary presence vector.
def setOfWords2Vec(vocabList, inputset):
    """Return a 0/1 vector the length of vocabList; 1 marks words present
    in inputset. Unknown words are reported but otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in inputset:
        try:
            # EAFP: index() raises if the word is not in the vocabulary
            returnVec[vocabList.index(word)] = 1
        except ValueError:
            print("the word:{0} is not in my vocabulary".format(word))
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes model from binary word vectors.

    Args:
        trainMatrix: 2-D array, one row per document, 0/1 word presence.
        trainCategory: 1-D array of labels (0 or 1) per document.
    Returns:
        p0Vect: list of log P(w_i | c=0) for each vocabulary word.
        p1Vect: list of log P(w_i | c=1) for each vocabulary word.
        pAbusive: prior probability P(c=1).

    Bug fix: the original else-branch accumulated the class-0 word total
    into p1Denom instead of p0Denom, skewing both conditional probabilities.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior P(c=1)
    # Laplace smoothing: start word counts at 1 and denominators at 2
    # so no conditional probability is ever exactly zero.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])  # fixed: was p1Denom
    # log-probabilities guard against underflow when many small
    # probabilities are later multiplied (summed in log space).
    p1Vect = [log(x) for x in p1Num / p1Denom]
    p0Vect = [log(x) for x in p0Num / p0Denom]
    return p0Vect, p1Vect, pAbusive
利用訓練得到的引數進行貝葉斯分類
def classifyNB(vec2classify, p0Vec, p1Vec, pclass1):
    """Classify a binary word vector by comparing class log-posteriors.

    Args:
        vec2classify: 0/1 numpy array marking which words are present.
        p0Vec, p1Vec: per-word log conditional probabilities for each class.
        pclass1: prior probability of class 1.
    Returns:
        1 if class 1 is more likely, else 0.
    """
    # log posterior (up to a shared constant): sum of log-likelihoods
    # of the present words plus the log prior of the class.
    score1 = log(pclass1) + sum(vec2classify * p1Vec)
    score0 = log(1.0 - pclass1) + sum(vec2classify * p0Vec)
    return 1 if score1 > score0 else 0
其中 p1 和 p0 分別表示 $\ln\big(p(w_1|c=1)\,p(w_2|c=1)\cdots p(w_n|c=1)\cdot p(c=1)\big)$ 和 $\ln\big(p(w_1|c=0)\,p(w_2|c=0)\cdots p(w_n|c=0)\cdot p(c=0)\big)$。取對數是為了防止 $p(w_1|c)\,p(w_2|c)\cdots p(w_n|c)$ 這樣多個小於 1 的機率連乘時結果下溢。
測試分類模型
def testingNB():
    """Smoke-test the classifier on the toy corpus.

    Bug fix: the original called numpy.array(...), but the file only does
    `from numpy import *`, so the name `numpy` is undefined and every call
    raised NameError; use the star-imported `array` directly.
    """
    listOPosts, listclass = loadData()  # corpus and labels
    myVocabList = createVocaList(listOPosts)  # vocabulary
    trainMat = []
    for postinDoc in listOPosts:  # binary presence vector per document
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # conditional log-probabilities and class prior
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listclass))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print("testEntry classified as:{0}".format(classifyNB(thisDoc, p0V, p1V, pAb)))
    testEntry = ['stupid', 'garbage', 'quit']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print("testEntry classified as:{0}".format(classifyNB(thisDoc, p0V, p1V, pAb)))
利用樸素貝葉斯進行垃圾郵件測試
def textParse(bigstring):
    """Tokenize raw text into lowercase words longer than two characters.

    Bug fix: the original split on r'\w*' (runs of *word* characters),
    which removes every word and yields only empty strings / punctuation;
    the delimiter must be the non-word characters, r'\W+'.
    """
    import re  # local import, matching the original's style
    listOfTokens = re.split(r'\W+', bigstring)
    # keep only tokens longer than 2 chars (drops noise words) in lowercase
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train on 25 spam + 25 ham emails and report hold-out error.

    Ten documents are chosen at random as the test set; the remaining
    forty train the naive Bayes model. Prints the error rate.
    """
    docList = []
    classList = []
    fullText = []
    # load message i from both folders: spam/<i>.txt (label 1), ham/<i>.txt (label 0)
    for i in range(1, 26):
        for folder, label in (('spam', 1), ('ham', 0)):
            words = textParse(open('%s/%d.txt' % (folder, i)).read())
            docList.append(words)
            fullText.extend(words)
            classList.append(label)
    vocabList = createVocaList(docList)
    trainingSet = list(range(50))
    testSet = []
    # hold out 10 randomly chosen documents for testing
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # removed from training once held out
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is {0}".format(float(errorCount) / len(testSet)))