Machine Learning in Action, Chapter 4: Naive Bayes Classification (Source Code Walkthrough)
阿新 · Published 2019-01-07
Naive Bayes Classification
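For reference, the rule the code below implements: Bayes' theorem gives, for a word vector $\mathbf{w}$ and class $c_i$,

$$p(c_i \mid \mathbf{w}) = \frac{p(\mathbf{w} \mid c_i)\, p(c_i)}{p(\mathbf{w})},$$

and the "naive" conditional-independence assumption factors the likelihood as $p(\mathbf{w} \mid c_i) = \prod_k p(w_k \mid c_i)$. Since $p(\mathbf{w})$ is the same for every class, the classifier only needs to compare $\sum_k \log p(w_k \mid c_i) + \log p(c_i)$ across classes, which is exactly what `classifyNB` below computes.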
```python
# coding=utf-8
'''
Created on 2016-01-09
@author: admin
'''
from numpy import *

# Load the data set
def loadDataSet():
    # Define the list of posts
    postingList = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Define the label vector: 1 = abusive, 0 = not abusive
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

# Build the vocabulary list
def createVocabList(dataSet):
    # Define the vocabulary set
    vocabSet = set([])
    # Walk through the documents
    for document in dataSet:
        # Merge each document into vocabSet; | is set union
        vocabSet = vocabSet | set(document)
    # Return the vocabulary as a list
    return list(vocabSet)

# Convert a document into a vector (set-of-words model)
def setOfWords2Vec(vocabList, inputSet):
    # Define the vector to return
    returnVec = [0] * len(vocabList)
    # Walk through the words of the input document
    for word in inputSet:
        # The word is in the vocabulary
        if word in vocabList:
            # Set the corresponding position to 1
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    # Return the vector
    return returnVec

# Convert a document into a vector with the bag-of-words model,
# counting how many times each word occurs
def bagOfWords2VecMN(vocabList, inputSet):
    # Define the vector to return
    returnVec = [0] * len(vocabList)
    # Walk through the words of the input document
    for word in inputSet:
        # The word is in the vocabulary
        if word in vocabList:
            # Add 1 to the count of the corresponding word
            returnVec[vocabList.index(word)] += 1
    # Return the vector
    return returnVec

# Estimate the conditional probabilities (training)
def trainNB0(trainMatrix, trainCategory):
    # Number of documents
    numTrainDocs = len(trainMatrix)
    # Number of words
    numWords = len(trainMatrix[0])
    # Class prior; abusive = 1, not abusive = 0
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Initialize the counters, 1 x numWords; p0 is "not abusive"
    # p0Num = zeros(numWords)
    p0Num = ones(numWords)   # ones rather than zeros: Laplace smoothing
    # Initialize the counter for p1 ("abusive")
    p1Num = ones(numWords)
    # Initialize the denominators
    p0Denom = 2.0
    p1Denom = 2.0
    # Walk through the documents
    for i in range(numTrainDocs):
        # Count the words of abusive documents; trainMatrix holds 0/1 vectors
        if trainCategory[i] == 1:
            # p1Num holds per-word counts
            p1Num += trainMatrix[i]
            # p1Denom holds the total word count
            p1Denom += sum(trainMatrix[i])
        # Count the words of non-abusive documents
        else:
            # Per-word counts under "not abusive"
            p0Num += trainMatrix[i]
            # Total word count under "not abusive"
            p0Denom += sum(trainMatrix[i])
    # Probability of each word under "abusive"
    # p1Vect = p1Num / p1Denom
    p1Vect = log(p1Num / p1Denom)   # log to avoid underflow
    # Probability of each word under "not abusive"
    # p0Vect = p0Num / p0Denom
    p0Vect = log(p0Num / p0Denom)
    # Return the word probabilities and P(abusive); P(not abusive) = 1 - pAbusive
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Log-probability of "abusive"
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    # Log-probability of "not abusive"
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    # The larger value decides the class
    if p1 > p0:
        return 1
    else:
        return 0

# Quick test
def testingNB():
    # Load the data set
    listOPosts, listClass = loadDataSet()
    # Build the vocabulary list
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClass))
    # print(p0V, p1V, pAb)
    # print(trainMat)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

# Text parsing: input a string, output a list of words
def textParse(bigString):
    # Import the regular-expression module
    import re
    # Split the string on runs of non-word characters
    # (\W+ rather than \W*, which newer Python versions reject)
    listOfTokens = re.split(r'\W+', bigString)
    # Return the lowercased words longer than two characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
```
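Two details in `trainNB0` are worth pausing on. The counters start at `ones` with denominators of 2.0 (Laplace smoothing), so a word never seen in a class still gets a small nonzero probability instead of zeroing out the whole class score. And the per-word probabilities are returned as logs, because multiplying dozens of small probabilities underflows 64-bit floats to 0.0. A minimal standalone sketch of both effects (not part of the book's code; the numbers are illustrative):

```python
import numpy as np

# A hypothetical 300-word document where each word has probability 0.01
probs = np.full(300, 0.01)

# The naive product underflows to exactly 0.0 in 64-bit floats
print(np.prod(probs))             # 0.0

# Summing logs keeps the value representable
print(np.sum(np.log(probs)))      # about -1381.6

# Laplace smoothing: a word with zero observed count in a class still
# gets a small nonzero probability, so it cannot veto the whole class
count = 0                         # word never seen in this class
total = 100.0                     # total words seen in this class
print((count + 1) / (total + 2))  # roughly 0.0098 instead of 0.0
```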
Finally, the spam-filter test ties everything together: it tokenizes the 25 spam and 25 ham messages, holds out 10 documents at random, trains on the remaining 40, and reports the error rate on the hold-out set.

```python
# Spam-filter test
def spamTest():
    # docList: the documents; classList: the labels; fullText: all words
    docList = []; classList = []; fullText = []
    # Walk the .txt files under email/spam and email/ham
    for i in range(1, 26):
        # Read and tokenize a spam message (latin-1 because some of the
        # book's email files are not valid UTF-8)
        wordList = textParse(open('email/spam/%d.txt' % i, encoding='latin-1').read())
        # Append the word list to the document list
        docList.append(wordList)
        # Collect every word in fullText
        fullText.extend(wordList)
        # Class 1: spam
        classList.append(1)
        # Read a non-spam (ham) message
        wordList = textParse(open('email/ham/%d.txt' % i, encoding='latin-1').read())
        # Append it to the document list
        docList.append(wordList)
        # Add its words to fullText
        fullText.extend(wordList)
        # Class 0: not spam
        classList.append(0)
    # Build the vocabulary list
    vocabList = createVocabList(docList)
    # Indices of the training set, and the test set
    trainingSet = list(range(50)); testSet = []
    # Randomly pick 10 documents for the test set
    for i in range(10):
        # Random index
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document into the test set...
        testSet.append(trainingSet[randIndex])
        # ...and delete it from the training set
        del(trainingSet[randIndex])
    # The training matrix and its labels
    trainMat = []; trainClasses = []
    # Walk the training set to estimate the prior and conditional probabilities
    for docIndex in trainingSet:
        # Vectorize each word list into trainMat
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        # Record the training label
        trainClasses.append(classList[docIndex])
    # Estimate the prior and the conditional probabilities
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    # Error counter
    errorCount = 0
    # Classify the test set
    for docIndex in testSet:
        # Vectorize the test document
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        # Compare the prediction against the true label
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            # Misclassified: bump the error counter
            errorCount += 1
            print("classification error", docList[docIndex])
    # Print the error rate
    print('the error rate is: ', float(errorCount) / len(testSet))
```
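Because the 10 test documents are chosen at random, the printed error rate varies from run to run; repeating the hold-out split and averaging gives a steadier estimate. A minimal sketch, under the assumption that `spamTest` is modified to end with `return float(errorCount) / len(testSet)`:

```python
# Hypothetical wrapper; assumes spamTest() returns its error rate
def averageErrorRate(trials=10):
    total = 0.0
    for _ in range(trials):
        total += spamTest()
    print('average error rate over %d trials: %f' % (trials, total / trials))

averageErrorRate()
```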