【python與機器學習入門3】樸素貝葉斯2——垃圾郵件分類
阿新 • • 發佈:2018-12-12
參考部落格:樸素貝葉斯基礎篇之言論過濾器(po主 Jack-Cui)——大部分內容轉載自該部落格
參考書籍:《機器學習實戰》——第四章4.6
1 資料集
ham資料夾 正常郵件
spam資料夾 垃圾郵件
2 資料處理
3 訓練
4 測試
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Naive Bayes spam-email classifier (Machine Learning in Action, ch. 4.6).

Reads 25 ham and 25 spam emails from email/ham/ and email/spam/, holds out
10 random documents as a test set, trains a two-class naive Bayes model on
the remaining 40, and prints the test error rate.
"""

import random
import re

import numpy as np


def textParse(bigString):
    """Split raw text into lowercase tokens, keeping words longer than 2 chars.

    Splits on one or more non-word characters (r'\\W+').  The original
    r'\\W*' is a zero-width pattern that re.split() rejects with
    ValueError on Python >= 3.7 and that produced empty tokens before that.
    """
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def createVocablist(dataset):
    """Return a list of the unique words appearing across all documents."""
    vocabList = set()
    for document in dataset:
        vocabList |= set(document)
    return list(vocabList)


def setOfWord2Vec(vocabList, dataset):
    """Convert a document (word list) into a 0/1 presence vector over vocabList.

    Words absent from the vocabulary are reported but otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in dataset:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # mark word as present
        else:
            print("sorry this word %s is not in our vocablist" % word)
    return returnVec


def trainNB(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model.

    Args:
        trainMatrix: list of 0/1 word-presence vectors, one per document.
        trainCategory: list of labels (1 = spam, 0 = ham).

    Returns:
        (p0Vec, p1Vec, pSpam): per-word log-probability vectors
        log p(w_i|class) for class 0 and class 1, and the prior p(class=1).

    Laplace smoothing (word counts start at 1, denominators at 2) avoids
    zero probabilities; log-space avoids floating-point underflow when the
    per-word probabilities are later multiplied (summed) in classifyNB.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pSpam = float(sum(trainCategory)) / float(numTrainDocs)
    p1Num = np.ones(numWords)   # per-word counts within each class (vector)
    p0Num = np.ones(numWords)
    p1Denom = 2.0               # total word occurrences per class (scalar)
    p0Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vec = np.log(p1Num / p1Denom)
    p0Vec = np.log(p0Num / p0Denom)
    return p0Vec, p1Vec, pSpam


def classifyNB(vec2classify, p0vec, p1vec, pclass1):
    """Return 1 (spam) if log p(c1|w) > log p(c0|w), else 0 (ham).

    Computes sum_i w_i * log p(w_i|c) + log p(c) for each class; the
    product of probabilities becomes a sum because the vectors are logs.
    """
    p1 = np.sum(vec2classify * p1vec) + np.log(pclass1)
    p0 = np.sum(vec2classify * p0vec) + np.log(1 - pclass1)
    return 1 if p1 > p0 else 0


def spamTest():
    """Run one random 40/10 train/test split over the 50 sample emails."""
    classList = []
    docList = []
    # Load the 50 emails.  errors='ignore' skips the stray non-UTF-8 byte
    # present in one of the book's sample files, which would otherwise
    # crash a text-mode read on Python 3.
    for i in range(1, 26):
        with open('email/spam/%d.txt' % i, 'r', errors='ignore') as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i, 'r', errors='ignore') as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)
    vocabList = createVocablist(docList)
    # Hold out 10 random test documents.  range() is immutable on
    # Python 3, so materialize a list before deleting from it.
    trainSetIndex = list(range(50))
    testSetIndex = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainSetIndex)))
        testSetIndex.append(trainSetIndex[randIndex])
        del trainSetIndex[randIndex]  # deleting prevents duplicate picks
    # Vectorize the training documents and train the model.
    trainMat = []
    trainClass = []
    for idx in trainSetIndex:
        trainMat.append(setOfWord2Vec(vocabList, docList[idx]))
        trainClass.append(classList[idx])
    p0Vec, p1Vec, pSpam = trainNB(trainMat, trainClass)
    # Classify the held-out documents and report the error rate.
    errorCount = 0.0
    for idx in testSetIndex:
        wordVec = setOfWord2Vec(vocabList, docList[idx])
        if classifyNB(wordVec, p0Vec, p1Vec, pSpam) != classList[idx]:
            print("classify wrong:origin %d" % classList[idx])
            errorCount += 1
    print("error rate = %.2f" % (errorCount / len(testSetIndex)))


if __name__ == '__main__':
    spamTest()