Python利用樸素貝葉斯進行評分的分類
阿新 • • 發佈:2018-12-28
利用樸素貝葉斯可以對文件進行分類,比如說進行垃圾郵件的過濾等。接下來的案例是對評分進行分類的:經過學習,判斷一句話應該屬於幾分(0-5分之間)。
先利用爬蟲爬取樣本資料,該資料來自公開課的評論。
# coding=utf-8
import urllib2
from sgmllib import SGMLParser
import jieba
class CommentParser(SGMLParser):
    """Pull [comment-text, score] pairs out of a review-listing page.

    A review sits in a <table class="table table-hover">; its score is the
    data-score attribute of a nested <div>, and the comment body is the
    character data of the third <p> inside that table.  Finished pairs
    are collected in ``self.data``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.__in_table = False  # inside a matching review table
        self.__in_p = False      # inside a <p> of that table
        self.__text = ''         # accumulated comment characters
        self.__score = ''        # value of the data-score attribute
        self.__p_count = 0       # number of <p> tags opened in this table
        self.data = []           # finished [text, score] pairs

    def start_table(self, attr):
        # Only tables carrying the review CSS classes are of interest.
        for name, value in attr:
            if name == 'class' and value == 'table table-hover':
                self.__in_table = True

    def end_table(self):
        if self.__in_table:
            # Flush the finished review and reset all per-table state.
            self.data.append([self.__text, self.__score])
            self.__text = ''
            self.__score = ''
            self.__p_count = 0
            self.__in_table = False

    def start_div(self, attr):
        if self.__in_table:
            for name, value in attr:
                if name == 'data-score':
                    self.__score = value

    def end_div(self):
        pass

    def start_p(self, attrs):
        if self.__in_table:
            self.__p_count += 1
            self.__in_p = True

    def end_p(self):
        if self.__in_table:
            self.__in_p = False

    def handle_data(self, data):
        # The comment body is the character data of the table's third <p>.
        if self.__in_table and self.__in_p and self.__p_count == 3:
            self.__text += data
def get_page(url):
    """Fetch *url* and return the [comment, score] pairs parsed from it.

    Fixes two resource issues in the original: the urllib2 response was
    never closed (leaked connection) and ``SGMLParser.close()`` was never
    called, so buffered SGML data could go unprocessed.

    :param url: address of one review-listing page
    :return: list of [comment_text, score] pairs (see CommentParser.data)
    """
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()  # don't leak the HTTP connection
    parser = CommentParser()
    parser.feed(page)
    parser.close()  # flush any data the parser is still buffering
    return parser.data
def download(pages=8, out_path='result.txt'):
    """Scrape review pages and append '[score]comment' lines to *out_path*.

    Generalizes the original hard-coded range(1, 9) into a ``pages``
    parameter (default preserves the old behavior) and opens the output
    file once instead of re-opening it for every page.

    :param pages: number of listing pages to fetch (pages 1..pages)
    :param out_path: file receiving one '[score]comment' line per review
    """
    url = 'http://coursegraph.com/reviews/'
    with open(out_path, 'a+') as f:
        for i in range(1, pages + 1):
            for row in get_page(url + str(i)):
                # Collapse each comment onto a single line.
                text = row[0].strip().replace('\n', '').replace('\r', '')
                f.write('[' + row[1] + ']' + text + '\n')
def jieba_chn(in_path='result.txt', out_path='result1.txt'):
    """Tokenize scraped reviews into 'score,token1,token2,...' lines.

    Reads '[score]comment' lines from *in_path*, cuts the comment with
    jieba's search-engine tokenizer, keeps only tokens longer than one
    character (which also drops punctuation), and writes one
    'score,token,token,...' line per review to *out_path*.

    Fixes from the original: the input file handle was opened without
    ever being closed; the inner loop variable shadowed ``row``; and
    ``row[:5][1:4]`` is simply ``row[1:4]`` (the score digits).

    :param in_path: raw '[score]comment' file produced by download()
    :param out_path: tokenized output file consumed by load_data_set()
    """
    with open(in_path, 'r+') as src:  # original leaked this handle
        all_value = src.readlines()
    with open(out_path, 'w+') as f:
        for row in all_value:
            value = row[1:4]  # the score between the brackets, e.g. '4.5'
            for token in jieba.cut_for_search(row[5:]):
                if len(token) > 1:  # single chars are mostly punctuation
                    value += ',' + token
            f.write(value.encode('utf-8') + '\n')
# Step 1: download the raw review data (run once, then leave commented out).
# download()
# Step 2: split the downloaded reviews into tokens -> result1.txt.
jieba_chn()
很簡單的一個網路爬蟲,然後利用**結巴**分詞把文件分割成片語,並去除掉標點符號等。具體的結果可以下載本案例進行檢視。
下面開始進行利用樸素貝葉斯進行分類操作
先讀取文件資料
def load_data_set(path='result1.txt'):
    """Load the tokenized reviews produced by jieba_chn().

    Each line of *path* looks like 'score,token1,token2,...'.

    Generalizes the hard-coded file name into a parameter (default keeps
    old behavior) and drops the redundant ``.replace('\\n', '')`` — a
    readlines() line only carries a trailing newline, which strip()
    already removes.

    :param path: tokenized-review file
    :return: (dataSet, labels) — dataSet is a list of token lists and
             labels the matching scores rounded to one decimal place
    """
    dataSet = []
    labels = []
    with open(path, 'r+') as f:
        for row in f.readlines():
            t = row.strip().split(',')
            labels.append(round(float(t[0]), 1))
            dataSet.append(t[1:])
    return dataSet, labels
建立一個單詞向量和對應的標籤
def create_vocab_list(dataSet, labels):
    """Flatten documents into a word list with a parallel label list.

    Every document contributes its distinct words once, and each of those
    words is paired with the document's label, so the two returned lists
    stay index-aligned.

    :param dataSet: list of documents (each a list of tokens)
    :param labels: one score per document
    :return: (vocabSet, labelSet) of equal length
    """
    vocabSet = []
    labelSet = []
    for index, document in enumerate(dataSet):
        unique_words = list(set(document))
        vocabSet += unique_words
        labelSet += [labels[index]] * len(unique_words)
    return vocabSet, labelSet
根據單詞出現修改相應的標籤
def set_of_words2_vec(vocabList, label, inputSet):
    """Build a label-weighted occurrence vector over *vocabList*.

    For every word of *inputSet*, every position of *vocabList* that
    holds that word is incremented by *label* (vocabList may contain
    duplicates and all of them are credited, matching the original scan).

    Performance fix: the original rescanned the whole vocabulary for each
    input word — O(len(inputSet) * len(vocabList)).  A word -> positions
    map is built once instead, giving the same output in linear time.

    :param vocabList: vocabulary (possibly with repeated words)
    :param label: weight added per occurrence (the document's score)
    :param inputSet: tokens of the document to encode
    :return: list of len(vocabList) accumulated weights
    """
    positions = {}
    for index, word in enumerate(vocabList):
        positions.setdefault(word, []).append(index)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        for index in positions.get(word, ()):
            returnVec[index] += label
    return returnVec
對資料進行學習操作,計算出片語出現的概率分佈
def trainNB0(trainMatrix, trainCategory):
    """Train the naive-Bayes model: priors and per-class word log-probabilities.

    Because set_of_words2_vec weights counts by the document's label, each
    row of *trainMatrix* is divided by its label again to recover plain
    occurrence counts.  Laplace smoothing (count + 1, denominator + 2)
    keeps unseen words from zeroing the product.

    BUG FIX: the conditional probabilities are now returned as
    *log*-probabilities.  classifyNB() sums the per-word contributions and
    adds log(prior); summing raw probabilities (as the original did) is
    not a valid model — the standard formulation sums logs so the sum is
    the log of the product of probabilities.

    NOTE(review): a 0.0 label would divide by zero below — this was also
    true of the original; confirm scores are always positive.

    :param trainMatrix: list of word vectors (one per document)
    :param trainCategory: labels aligned with trainMatrix
    :return: (cond, prior) where cond maps str(label) -> array of
             log P(word|label) and prior maps str(label) -> P(label)
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    labelSet = list(set(trainCategory))
    # Class priors P(label).
    pAbusive = {}
    for r in labelSet:
        pAbusive[str(r)] = len([row for row in trainCategory if row == r]) \
                           / float(numTrainDocs)
    # Laplace-smoothed numerators and denominators, one pair per class.
    pNumber = {}
    pDenom = {}
    for row in labelSet:
        pNumber[str(row)] = ones(numWords)
        pDenom[str(row)] = 2.0
    for i in range(numTrainDocs):
        # Divide by the label to undo set_of_words2_vec's weighting.
        pNumber[str(trainCategory[i])] += [row / trainCategory[i] for row in trainMatrix[i]]
        pDenom[str(trainCategory[i])] += sum(trainMatrix[i]) / trainCategory[i]
    ret = {}
    for label in labelSet:
        ret[str(label)] = log(pNumber[str(label)] / pDenom[str(label)])
    return ret, pAbusive
判斷測試片語的出現的概率,選擇出出現概率最高的一項,就是該片語的評分了。
def classifyNB(vec2Classify, pVec, pClass, trainCategory):
    """Score *vec2Classify* against every class and return the best label.

    A class's score is the dot product of the word vector with that
    class's conditional-probability vector plus the log of the class
    prior; the label with the highest score wins.

    :param vec2Classify: word vector (numpy array) for the document
    :param pVec: str(label) -> conditional-probability vector
    :param pClass: str(label) -> class prior
    :param trainCategory: training labels (defines the candidate classes)
    :return: winning label as a float
    """
    scores = {}
    for label in set(trainCategory):
        key = str(label)
        scores[key] = sum(vec2Classify * pVec[key]) + log(pClass[key])
    best = max(scores.items(), key=lambda item: item[1])
    return float(best[0])
以下是對文件進行測試的操作。
def testingNB():
dataSet, labels = load_data_set()
vocabSet, labelSet = create_vocab_list(dataSet, labels)
trainMatrix = []
for index, row in enumerate(dataSet):
trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
pV, pAb = trainNB0(trainMatrix, labels)
testEntry = ['學習', '很棒', '真不錯']
testEntry = list(set(testEntry))
thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
print testEntry, 'classified as: ', classifyNB(thisDoc, pV, pAb, labels)
def test(number):
'''
驗證演算法的正確性
:param number: 當成測試樣本的額百分比
:return:
'''
dataSet, labels = load_data_set()
test_number = int(len(dataSet) * number)
testSet = []
for i in range(test_number):
randIndex = int(random.uniform(0, len(dataSet)))
testSet.append([dataSet[randIndex], labels[randIndex]])
del (dataSet[randIndex])
del (labels[randIndex])
# 進行學習
vocabSet, labelSet = create_vocab_list(dataSet, labels)
trainMatrix = []
for index, row in enumerate(dataSet):
trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
pV, pAb = trainNB0(trainMatrix, labels)
# 進行測試
errorCount = 0
for row in testSet:
testEntry = row[0]
testEntry = list(set(testEntry))
thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
ret = classifyNB(thisDoc, pV, pAb, labels)
if ret != row[1]:
print "classification error", row[1], ret
errorCount += 1
print 'the error rate is: ', float(errorCount) / len(testSet)
# Hold out 10% of the corpus and report the classifier's error rate.
test(0.1)
# testingNB()
悲劇的是測試的結果很不理想,難道中文不能這樣分詞,還是哪一個細節出現了問題?還請大神指導下,權當學習一下吧!
[案例現在地址](http://download.csdn.net/detail/u010154424/9602826)