
Classifying Review Scores with Naive Bayes in Python

Naive Bayes can be used to classify documents, a classic example being spam filtering. The case below instead classifies review scores: after training, the model judges how many points, between 0 and 5, a sentence deserves.
First, crawl some sample data. The data comes from reviews of open online courses. (Note that the code throughout is Python 2: it relies on `urllib2`, the since-removed `sgmllib` module, and `print` statements.)
```python
# coding=utf-8

import urllib2
from sgmllib import SGMLParser
import jieba


class CommentParser(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.__start_table = False
        self.__start_p = False
        self.__value_p = ''
        self.__value_div = ''
        self.__p_state = 0
        self.data = []

    def start_table(self, attr):
        # each review sits in a <table class="table table-hover">
        for k, v in attr:
            if k == 'class' and v == 'table table-hover':
                self.__start_table = True

    def end_table(self):
        if self.__start_table:
            self.data.append([self.__value_p, self.__value_div])
            self.__value_p = ''
            self.__value_div = ''
            self.__p_state = 0
            self.__start_table = False

    def start_div(self, attr):
        # the rating is stored in a div's data-score attribute
        if self.__start_table:
            for k, v in attr:
                if k == 'data-score':
                    self.__value_div = v

    def end_div(self):
        pass

    def start_p(self, attrs):
        if self.__start_table:
            self.__p_state += 1
            self.__start_p = True

    def end_p(self):
        if self.__start_table:
            self.__start_p = False

    def handle_data(self, data):
        # the review text lives in the third <p> of the table
        if self.__start_table and self.__start_p and self.__p_state == 3:
            self.__value_p += data


def get_page(url):
    page = urllib2.urlopen(url).read()
    parser = CommentParser()
    parser.feed(page)
    return parser.data


def download():
    url = 'http://coursegraph.com/reviews/'
    for i in range(1, 9):
        value = get_page(url + str(i))
        with open('result.txt', 'a+') as f:
            for row in value:
                # one review per line: "[score]review text"
                f.write('[' + row[1] + ']' + row[0].strip().replace('\n', '').replace('\r', '') + '\n')


def jieba_chn():
    all_value = open('result.txt', 'r+').readlines()
    with open('result1.txt', 'w+') as f:
        for row in all_value:
            # extract the score from between the leading brackets, e.g. "[4.5]" -> "4.5"
            value = row[:5][1:4]
            jb = jieba.cut_for_search(row[5:])
            for word in jb:
                # drop punctuation and single-character tokens
                if len(word) > 1:
                    value += ',' + word
            f.write(value.encode('utf-8') + '\n')


# download the raw data
# download()
# segment the review text
jieba_chn()
```
That's a straightforward web crawler. **jieba** ("結巴") segmentation then splits each document into words, dropping punctuation and single-character tokens along the way. The full results can be inspected by downloading the example project (linked at the end).
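As a minimal sketch of what the segmentation step produces (the sample sentence here is made up; the actual tokens depend on jieba's dictionary):

```python
# coding=utf-8
import jieba

# a hypothetical review sentence
words = jieba.cut_for_search('這門課程講得很棒,學習收穫真不錯')
# keep only tokens longer than one character, as jieba_chn() does above
print ','.join(w for w in words if len(w) > 1).encode('utf-8')
```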
Now let's run Naive Bayes over the segmented documents.

First, read the data (the imports cover the NumPy helpers and `random` that the later functions rely on):
```python
from numpy import ones, array, log
import random


def load_data_set():
    dataSet = []
    labels = []
    with open('result1.txt', 'r+') as f:
        for row in f.readlines():
            # each line is "score,word1,word2,..."
            t = row.strip().replace('\n', '').split(',')
            labels.append(round(float(t[0]), 1))
            dataSet.append(t[1:])
    return dataSet, labels
```
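Given how `jieba_chn()` writes `result1.txt`, each line begins with the score followed by comma-separated words, so the return values look roughly like this (the scores and words below are purely illustrative):

```python
# a line of result1.txt: "4.5,課程,學習,很棒"
dataSet, labels = load_data_set()
# labels  -> [4.5, 3.0, ...]                  one rounded score per review
# dataSet -> [['課程', '學習', '很棒'], ...]   the segmented words of each review
```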
Build the vocabulary list, pairing every word with the score of the document it came from:
```python
def create_vocab_list(dataSet, labels):
    vocabSet = []
    labelSet = []
    for index, document in enumerate(dataSet):
        # append each document's unique words...
        vocabSet.extend(list(set(document)))
        # ...and repeat that document's score once per word, keeping both lists aligned
        labelSet.extend([labels[index] for i in range(len(set(document)))])
    return vocabSet, labelSet
```
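A tiny made-up example shows the shape of the output: `vocabSet` concatenates each document's unique words (so the same word can appear once per document), and `labelSet` repeats the owning document's score once per word:

```python
docs = [['很棒', '學習'], ['學習', '一般', '學習']]
scores = [5.0, 3.0]
vocab, vocab_labels = create_vocab_list(docs, scores)
# vocab        -> ['很棒', '學習', '學習', '一般']   (set ordering may vary)
# vocab_labels -> [5.0, 5.0, 3.0, 3.0]
```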

Turn a document into a vector over that vocabulary, adding the given label to every slot whose word occurs in the document:

```python
def set_of_words2_vec(vocabList, label, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # vocabList may hold the same word once per source document,
        # so every matching slot gets incremented
        for index, r in enumerate(vocabList):
            if r == word:
                returnVec[index] += label
    return returnVec
```
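Continuing the made-up example, a weight of 1 yields a plain occurrence-count vector; note that both duplicate '學習' slots in the vocabulary receive the count:

```python
vec = set_of_words2_vec(vocab, 1, ['學習', '很棒'])
# one slot per vocabulary entry, e.g. [1, 1, 1, 0]
```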
Now train on the data, estimating the probability distribution of words within each score class:
```python
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    labelSet = list(set(trainCategory))
    # class priors: the fraction of training documents carrying each score
    pAbusive = {}
    for r in labelSet:
        pAbusive[str(r)] = len([row for row in trainCategory if row == r]) \
                           / float(numTrainDocs)
    # Laplace smoothing: start every word count at 1 and every denominator at 2
    pNumber = {}
    pDenom = {}
    for row in labelSet:
        pNumber[str(row)] = ones(numWords)
        pDenom[str(row)] = 2.0
    for i in range(numTrainDocs):
        # the training vectors were weighted by the score in set_of_words2_vec,
        # so divide it back out to recover raw occurrence counts
        # (this assumes no document has score 0, which would divide by zero)
        pNumber[str(trainCategory[i])] += [row / trainCategory[i] for row in trainMatrix[i]]
        pDenom[str(trainCategory[i])] += sum(trainMatrix[i]) / trainCategory[i]

    ret = {}
    for i in range(len(labelSet)):
        # take the log so classifyNB can sum log-probabilities
        # instead of multiplying many small numbers
        ret[str(labelSet[i])] = log(pNumber[str(labelSet[i])] / pDenom[str(labelSet[i])])

    return ret, pAbusive
```
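In effect, `trainNB0` computes a Laplace-smoothed estimate of each word's probability under every score class $c$ (the `ones(numWords)` initialisation and the `2.0` denominator are the smoothing terms), plus the class prior:

$$\hat{P}(w_j \mid c) = \frac{1 + \sum_{i:\,y_i = c} x_{ij}}{2 + \sum_{i:\,y_i = c} \sum_k x_{ik}}, \qquad \hat{P}(c) = \frac{|\{i : y_i = c\}|}{N},$$

where $x_{ij}$ is the occurrence count of vocabulary word $j$ in document $i$ (recovered by dividing out the score weighting) and $N$ is the number of training documents. The function returns $\log \hat{P}(w_j \mid c)$ for each class together with the priors.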
To classify, compute each score class's log-probability for the test vector and take the highest-scoring class; that class is the predicted rating:
```python
def classifyNB(vec2Classify, pVec, pClass, trainCategory):
    labelSet = list(set(trainCategory))
    p = {}
    for row in labelSet:
        # log-posterior of each class: word log-probabilities plus the log prior
        p[str(row)] = sum(vec2Classify * pVec[str(row)]) + log(pClass[str(row)])
    # sort classes by log-posterior, highest first, and return the winner
    m = sorted(p.items(), key=lambda k: k[1], reverse=True)
    return float(m[0][0])
```
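This is the standard Naive Bayes decision rule in log space (summing logs avoids the floating-point underflow that multiplying many small probabilities would cause):

$$\hat{c} = \arg\max_{c} \Big( \log \hat{P}(c) + \sum_{j} n_j \log \hat{P}(w_j \mid c) \Big),$$

where $n_j$ is the count of vocabulary word $j$ in the test document, i.e. the entries of `vec2Classify`.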
The following code tests the classifier:
```python
def testingNB():
    dataSet, labels = load_data_set()
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        # each training vector is weighted by its document's score
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    testEntry = ['學習', '很棒', '真不錯']
    testEntry = list(set(testEntry))
    # weight 1 for a test document gives plain occurrence counts
    thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
    print testEntry, 'classified as: ', classifyNB(thisDoc, pV, pAb, labels)


def test(number):
    '''
    Check the accuracy of the algorithm.
    :param number: fraction of the data to hold out as the test set
    :return:
    '''
    dataSet, labels = load_data_set()
    test_number = int(len(dataSet) * number)
    testSet = []
    # randomly move test_number samples out of the training set
    for i in range(test_number):
        randIndex = int(random.uniform(0, len(dataSet)))
        testSet.append([dataSet[randIndex], labels[randIndex]])
        del (dataSet[randIndex])
        del (labels[randIndex])
    # train
    vocabSet, labelSet = create_vocab_list(dataSet, labels)
    trainMatrix = []
    for index, row in enumerate(dataSet):
        trainMatrix.append(set_of_words2_vec(vocabSet, labels[index], row))
    pV, pAb = trainNB0(trainMatrix, labels)
    # test
    errorCount = 0
    for row in testSet:
        testEntry = row[0]
        testEntry = list(set(testEntry))
        thisDoc = array(set_of_words2_vec(vocabSet, 1, testEntry))
        ret = classifyNB(thisDoc, pV, pAb, labels)
        if ret != row[1]:
            print "classification error", row[1], ret
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)


test(0.1)
# testingNB()
```
Sadly, the test results are far from ideal. Is it that Chinese can't be segmented this way, or is some detail off somewhere? Pointers from more experienced readers are very welcome; consider this a learning exercise!
[Download the example project](http://download.csdn.net/detail/u010154424/9602826)