Python機器學習與實戰筆記之樸素貝葉斯分類
阿新 • • 發佈:2018-12-30
1聯合概率分佈
p(x,y)=p(y)P(x|y) 或者p(A交B)=p(A)xp(B) p(A交B)不容易求,假設條件獨立拆分成兩個事件的乘積
2基本假設條件獨立性
3利用貝葉斯定理 p(y|x)=P(x,y)/p(x)=p(y)P(x|y)/sum(y-i)[p(y)P(x|y)]
y=max p(y)P(x|y)
貝葉斯決策理論要求計算兩個概率p1(x,y),p2(x, y):
如果p1(x,y) > p2 (x, y) , 那麼屬於類別1
如果p2(x, y) > p1(x, y),那麼屬於類別2
拉普拉斯估計
每一個似然函式的分子加1,分母則加上分子中被加1的項的總數(即拉普拉斯平滑,避免零概率)
在樸素貝葉斯使用數值特徵採用數值特徵離散化,找見資料分佈分割點切分 樸素貝葉斯分類器通常有兩種實現方式:一種基於貝努利模型實現, 一種基於多項式模型實現
import sys
sys.path.append("G:/python/pythonwork/ML")
p(x,y)=p(y)P(x|y) 或者p(A交B)=p(A)xp(B) p(A交B)不容易求,假設條件獨立拆分成兩個事件的乘積
2基本假設條件獨立性
3利用貝葉斯定理 p(y|x)=P(x,y)/p(x)=p(y)P(x|y)/sum(y-i)[p(y)P(x|y)]
y=max p(y)P(x|y)
貝葉斯決策理論要求計算兩個概率p1(x,y),p2(x, y):
如果p1(x,y) > p2 (x, y) , 那麼屬於類別1
如果p2(x, y) > p1(x, y),那麼屬於類別2
拉普拉斯估計
每一個似然函式的分子加1,分母則加上分子中被加1的項的總數(即拉普拉斯平滑,避免零概率)
在樸素貝葉斯使用數值特徵採用數值特徵離散化,找見資料分佈分割點切分 樸素貝葉斯分類器通常有兩種實現方式:一種基於貝努利模型實現, 一種基於多項式模型實現
這裡採用前一種實現方式。該實現方式中並不考慮詞在文件中出現的次數, 只考慮出不出現,
因此在這個意義上相當於假設詞是等權重的
匯入指定目錄下的py檔案,先匯入路徑,後引入檔案import sys
sys.path.append("G:/python/pythonwork/ML")
import bayes
Python版實現
http://blog.csdn.net/q383700092/article/details/51773364
R語言版呼叫函式
http://blog.csdn.net/q383700092/article/details/51774069
MapReduce簡化實現版
http://blog.csdn.net/q383700092/article/details/51778765
spark版
後續新增
垃圾郵件分類示例
# -*- coding: utf-8 -*-
"""Naive Bayes text classification ("Machine Learning in Action", ch. 4).

Implements the set-of-words (Bernoulli) model — each word's presence or
absence is a feature, so words are effectively equally weighted — plus a
bag-of-words variant.  Includes a toy abusive-post demo (testingNB), a
spam filter over on-disk emails (spamTest), and an RSS-feed region
classifier (localWords / getTopWords).

Ported from Python 2 to Python 3: print statements -> print(), dict
.iteritems() -> .items(), range() materialized before in-place deletion,
and the r'\W*' tokenizer erratum fixed to r'\W+'.
"""
import operator
import random
import re

from numpy import array, log, ones


def loadDataSet():
    """Return toy postings and their labels (1 = abusive, 0 = normal)."""
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the unique words appearing in any document.

    Order is unspecified (set-derived); callers index into it consistently
    within one run.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union with this document's words
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 word-presence vector over vocabList.

    Set-of-words model: a word is marked present (1) regardless of how
    many times it occurs.  Unknown words are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # presence only
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed per-class word log-probabilities and the prior.

    Args:
        trainMatrix: 2-D array, one presence/count vector per document.
        trainCategory: 1-D array of 0/1 labels (1 = abusive/spam class).

    Returns:
        (p0Vect, p1Vect, pAbusive) — log P(word | class 0),
        log P(word | class 1), and the prior P(class 1).
    """
    numTrainDocs = len(trainMatrix)   # number of training documents
    numWords = len(trainMatrix[0])    # vocabulary size
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior P(class 1)
    # Laplace smoothing: start every word count at 1 and each denominator
    # at 2 so an unseen word cannot zero out the whole product.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]           # per-word counts, class 1
            p1Denom += sum(trainMatrix[i])    # total words seen, class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Log space avoids underflow from multiplying many small probabilities.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if vec2Classify scores higher under class 1, else 0.

    Sums log-likelihoods of present words plus the log prior (the
    naive-Bayes decision rule in log space).
    """
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Smoke test: train on the toy data and classify two sample posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, doc) for doc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))


def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a word-count vector (bag-of-words model).

    Unlike setOfWords2Vec, repeated occurrences are counted.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def textParse(bigString):
    """Split a string into lowercase tokens longer than two characters."""
    # Fix for the book's erratum: r'\W*' matches the empty string and
    # splits between every character; r'\W+' splits on runs of
    # non-word characters as intended.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    """Train/test the spam filter on 25 spam + 25 ham emails on disk.

    Holds out 10 randomly chosen emails for testing and prints the
    classification error rate.  Reads fixed paths under
    G:/python/pythonwork/email/.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('G:/python/pythonwork/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('G:/python/pythonwork/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))  # materialize: del on a range fails in py3
    testSet = []
    for _ in range(10):  # hold out 10 random documents
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out documents
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))


def calcMostFreq(vocabList, fullText):
    """Return the 30 (word, count) pairs occurring most often in fullText.

    (The original comment claimed 100 words; the code returns 30.)
    """
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]


def localWords(feed1, feed0):
    """Train a classifier distinguishing two parsed RSS feeds.

    feed1 entries are class 1, feed0 entries class 0.  The 30 most
    frequent words are removed as stop words, 20 random entries are held
    out for testing, and the error rate is printed.

    Returns:
        (vocabList, p0V, p1V) for later inspection by getTopWords.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)  # remove top-30 words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))  # materialize for del (py3)
    testSet = []
    for _ in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out entries
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V


def getTopWords(ny, sf):
    """Print the most indicative words (log-probability > -6.0) per feed."""
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])