1. 程式人生 > >【機器學習六】貝葉斯NB

【機器學習六】貝葉斯NB

程式碼先貼上,後續再補上總結。

from numpy import *
# 過濾網站的惡意留言 侮辱性:1   非侮辱性:0
# 建立一個實驗樣本
def loadDataSet():
  postingList = [['my','dog','has','flea','problems','help','please'],
          ['maybe','not','take','him','to','dog','park','stupid'],
          ['my','dalmation','is','so','cute','I','love','him'],
          ['stop','posting','stupid','worthless','garbage'],
          ['mr','licks','ate','my','steak','how','to','stop','him'],
          ['quit','buying','worthless','dog','food','stupid']]
  classVec = [0,1,0,1,0,1]
  return postingList, classVec
# 建立一個包含在所有文件中出現的不重複詞的列表
def createVocabList(dataSet):
  vocabSet = set([])   # 建立一個空集
  for document in dataSet:
    vocabSet = vocabSet | set(document)  # 建立兩個集合的並集
  return list(vocabSet)
# 將文件詞條轉換成詞向量
def setOfWords2Vec(vocabList, inputSet):
  returnVec = [0]*len(vocabList)    # 建立一個其中所含元素都為0的向量
  for word in inputSet:
    if word in vocabList:
      # returnVec[vocabList.index(word)] = 1   # index函式在字串裡找到字元第一次出現的位置 詞集模型
      returnVec[vocabList.index(word)] += 1   # 文件的詞袋模型  每個單詞可以出現多次
    else: print( "the word: %s is not in my Vocabulary!" % word)
  return returnVec
# 樸素貝葉斯分類器訓練函式  從詞向量計算概率
def trainNB0(trainMatrix, trainCategory):
  
  numTrainDocs = len(trainMatrix)
  numWords = len(trainMatrix[0])
  pAbusive = sum(trainCategory)/float(numTrainDocs)
  # p0Num = zeros(numWords); p1Num = zeros(numWords)
  # p0Denom = 0.0; p1Denom = 0.0
  p0Num = ones(numWords);  # 避免一個概率值為0,最後的乘積也為0
  p1Num = ones(numWords);  # 用來統計兩類資料中,各詞的詞頻
  p0Denom = 2.0; # 用於統計0類中的總數
  p1Denom = 2.0 # 用於統計1類中的總數
  for i in range(numTrainDocs):
    if trainCategory[i] == 1:
      p1Num += trainMatrix[i]
      p1Denom += sum(trainMatrix[i])
    else:
      p0Num += trainMatrix[i]
      p0Denom += sum(trainMatrix[i])
      # p1Vect = p1Num / p1Denom
      # p0Vect = p0Num / p0Denom
  print(p1Num)
  print(p1Denom)
  p1Vect = log(p1Num / p1Denom)  # 在類1中,每個次的發生概率
  print(p1Vect)
  p0Vect = log(p0Num / p0Denom)   # 避免下溢位或者浮點數舍入導致的錯誤  下溢位是由太多很小的數相乘得到的
  return p0Vect, p1Vect, pAbusive
# 樸素貝葉斯分類器
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
#   print("---------")
#   print(vec2Classify)
#   print(p1Vec)
  print(vec2Classify*p1Vec)
  p1 = sum(vec2Classify*p1Vec) + log(pClass1)
  p0 = sum(vec2Classify*p0Vec) + log(1.0-pClass1)
  if p1 > p0:
    return 1
  else:
    return 0
def testingNB():
  listOPosts, listClasses = loadDataSet()
  myVocabList = createVocabList(listOPosts)
  trainMat = []
  for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
  p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
  testEntry = ['love']*101
  thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
  print( testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
  testEntry = ['stupid','garbage']
  thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
  print (testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
# Invoke the test routine. This call is unguarded, so it runs whenever the
# module is executed or imported.
testingNB()