
Naive Bayes Spam Filtering: A Detailed Code Example

1. Problem Description

Filter spam emails.

2. Thought Process

(1) Collect data: text files are provided.

(2) Prepare data: parse the text files into token vectors.

Here we need to build our own word list from the given text documents (splitting the text into tokens and filtering out what we do not need); in other words, we must establish parsing rules and a filter that fit the actual data (Python's support for this is very convenient, though other languages such as C++ work as well). In the implementation this can be a separate function, as sketched below.
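For illustration, here is a minimal sketch of such a parsing function in C++. The function name text_parse and the "drop tokens shorter than 3 characters" rule are assumptions, mirroring the textParse.py convention referenced in the listing below, not part of the original code:

#include <cctype>
#include <string>
#include <vector>

// Split text on non-alphanumeric characters, lowercase every token,
// and keep only tokens longer than 2 characters.
std::vector<std::string> text_parse(const std::string &text)
{
	std::vector<std::string> tokens;
	std::string cur;
	for (size_t i = 0; i < text.size(); i++)
	{
		unsigned char c = text[i];
		if (std::isalnum(c))
		{
			cur += (char)std::tolower(c);
		}
		else
		{
			if (cur.size() > 2) tokens.push_back(cur);  // filter very short tokens
			cur.clear();
		}
	}
	if (cur.size() > 2) tokens.push_back(cur);  // flush the last token
	return tokens;
}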

(3) Analyze data: inspect the tokens to make sure parsing is correct.

(4) Train the algorithm: run a training function over the data (essentially, count token frequencies, then turn those frequencies into probability estimates using maximum-likelihood or Bayesian estimation; in short, compute probabilities from the word vectors).

Apply Bayes' theorem:


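For each class $c_i$ (spam or ham) the classifier compares the posterior probability of the class given the word vector $\mathbf{w}$; the formula below is the standard form, reconstructed here because the original figure is missing:

$$P(c_i \mid \mathbf{w}) = \frac{P(\mathbf{w} \mid c_i)\,P(c_i)}{P(\mathbf{w})}$$

Under the naive conditional-independence assumption, $P(\mathbf{w} \mid c_i) = \prod_j P(w_j \mid c_i)$, and since $P(\mathbf{w})$ is identical for both classes it can be dropped, so it suffices to compare $\log P(c_i) + \sum_j \log P(w_j \mid c_i)$. This log-space sum is exactly what classify_NB() computes via inner_product in the listing below.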
(5) Test the algorithm: use the trained classifier to classify documents, and build a test function that computes the error rate over a document set.
(6) Use the algorithm: build a complete program that classifies input documents.

3. Code Example and Walkthrough
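Note: the listing assumes the parsed token files already exist on disk, one whitespace-separated token stream per file, laid out as ./email/hamParse/1.dat through ./email/hamParse/25.dat and ./email/spamParse/1.dat through ./email/spamParse/25.dat (these paths come straight from the code below).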

/*
* code list 4-1 : conversion function from docs list to vocabulary list
* code list 4-2 : training function for the Naive Bayes classifier
* code list 4-3 : naive Bayes classify function
* code list 4-4 : naive Bayes bag-of-words model
* code list 4-5 : text parse (textParse.py) and spam email test function get_error_rate()
* */

#include <iostream>
#include <map>
#include <set>
#include <cmath>
#include <vector>
#include <algorithm>
#include <numeric>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <fstream>
using namespace std;

class NaiveBayes         // Naive Bayes classifier
{
private:
	vector< vector<string> > list_of_docs;
	vector<int> list_classes;   // class-label vector (0 = ham, 1 = spam)
	map<string, int>  my_vocab_list;
	int *return_vec;
	vector< vector<int> > train_mat;  // training matrix
	vector<float> p0vect;
	vector<float> p1vect;
	float p_abusive;
	ifstream fin;
	ofstream fout;
	int test_data_num;

public:
	NaiveBayes()
	{
		cout << "please input the num of test data which should be less than 24 : " << endl;
		cin >> test_data_num;
		vector<string> vec;
		string word;
		string filename;
		char buf[3];
		string buf_str;
		// initialization: read the pre-parsed word files and store each file's tokens in list_of_docs (one element per document).
		// Files 1..test_data_num are held out for testing; files test_data_num+1..25 are used for training.
		for (int i = test_data_num + 1; i <= 25; i++)
		{
			sprintf(buf, "%d", i);  //convert digit to string  
			vec.clear();
			buf_str = buf;
			filename = "./email/hamParse/" + buf_str + ".dat";
			//cout<<"filename : "<<filename<<endl;  
			fin.open(filename.c_str());
			if (!fin)
			{
				cerr << "open the file " << filename << " error" << endl;
				exit(1);
			}
			while (fin >> word)
			{
				vec.push_back(word);
			}
			list_of_docs.push_back(vec);
			list_classes.push_back(0);
			filename.clear();
			fin.close();
		}

		for (int i = test_data_num + 1; i <= 25; i++)
		{
			sprintf(buf, "%d", i);
			vec.clear();
			buf_str = buf;
			filename = "./email/spamParse/" + buf_str + ".dat";
			//cout<<"filename : "<<filename<<endl;  
			fin.open(filename.c_str());
			if (!fin)
			{
				cerr << "open the file " << filename << " error" << endl;
				exit(1);
			}
			while (fin >> word)
			{
				vec.push_back(word);
			}
			list_of_docs.push_back(vec);
			list_classes.push_back(1);
			filename.clear();
			fin.close();
		}

	}

	~NaiveBayes()
	{
		fin.close();
		fout.close();
		list_of_docs.clear();
		list_classes.clear();
		my_vocab_list.clear();
		train_mat.clear();
		//delete [] return_vec;  
		p0vect.clear();
		p1vect.clear();
	}


	void create_vocab_list()
	{
		vector< vector<string> > ::iterator it = list_of_docs.begin();
		int index = 1;
		while (it != list_of_docs.end())
		{
			//vector<string> vec( *it.begin(),*it.end() );  
			vector<string> vec = *it;

			vector<string> ::iterator tmp_it = vec.begin();

			while (tmp_it != vec.end())
			{
				//cout<<*tmp_it<<" ";  
				if (my_vocab_list[*tmp_it] == 0)
				{
					my_vocab_list[*tmp_it] = index++; // index is the word's position in the vocabulary
				}
				tmp_it++;
			}
			it++;
		}

	}//create_vocab_list  

	// build a bag-of-words count vector for document idx: entry j counts how often vocabulary word j occurs
	void beg_of_words_to_vec(int idx)
	{
		//cout<<"set of words to vec begin the document id is : "<<idx<<endl;  
		int len = my_vocab_list.size() + 1;
		return_vec = new int[len](); // note: "new int[len]()" value-initializes every element to zero, unlike plain "new int[len]"
		fill(return_vec, return_vec + len, 0); // redundant with the value-initialization above, kept for clarity
		vector< vector<string> >::iterator it = list_of_docs.begin() + idx - 1;
		vector<string> vec = *it;
		vector<string> ::iterator itt = vec.begin();
		int pos = 0;
		while (itt != vec.end())
		{
			//          cout<<*itt<<" ";  
			pos = my_vocab_list[*itt];
			if (pos != 0)
			{
				return_vec[pos] += 1;
			}
			itt++;
		}
	}//beg_of_words_to_vec  

	void get_train_matrix()
	{
		cout << "get train matrix begin : " << endl;
		train_mat.clear();
		for (int i = 1; i <= list_of_docs.size(); i++)
		{
			beg_of_words_to_vec(i);
			vector<int> vec(return_vec, return_vec + my_vocab_list.size() + 1);
			train_mat.push_back(vec);
			delete[]return_vec;
		}
	}//get train matrix  

	void print()
	{
		cout << "print the train matrix begin : " << endl;
		vector< vector<int> > ::iterator it = train_mat.begin();
		while (it != train_mat.end())
		{
			vector<int> vec = *it;
			vector<int> ::iterator itt = vec.begin();
			while (itt != vec.end())
			{
				cout << *itt << " ";
				itt++;
			}
			cout << endl;
			it++;
		}

	}//print()  

	void train_NB0()
	{
		int num_train_docs = train_mat.size();
		cout << "num_train_docs = " << num_train_docs << endl;
		int num_words = train_mat[0].size() - 1;
		/* calculate the number of spam (class-1) documents */
		int sum = accumulate(list_classes.begin(), list_classes.end(), 0);
		cout << "sum = " << sum << endl;
		//float p_abusive = (float)sum/(float)num_train_docs;  
		p_abusive = (float)sum / (float)num_train_docs;
		cout << "p_abusive = " << p_abusive << endl;

		p0vect.resize(train_mat[0].size(), 1);  // initialize every word count to lambda = 1 (Laplace smoothing) so no estimated probability is zero
		p1vect.resize(train_mat[0].size(), 1);
		cout << "p0vect.size() = " << p0vect.size() << " , p1vect.size() = " << p1vect.size() << endl;
		float p0Denom = 2.0; // total word count in ham docs, initialized to 2 (the number of classes) for the smoothing
		float p1Denom = 2.0; // total word count in spam docs
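		/* With lambda = 1 and the denominators starting at 2, the loop below
		 * yields the Laplace-smoothed estimate
		 *     P(word_j | c) = (count_j(c) + 1) / (total_words(c) + 2),
		 * so every conditional probability stays strictly positive. */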

		/* accumulate the per-word counts and the per-class denominators */
		for (int i = 0; i < list_classes.size(); i++)
		{
			if (list_classes[i] == 1)  // spam (abusive) doc
			{
				for (int j = 0; j < p1vect.size(); j++)
				{
					p1vect[j] += train_mat[i][j];
					p1Denom += train_mat[i][j]; // sum all word occurrences; the original only counted entries equal to 1 (a point the author flagged as unclear), whereas summing the counts matches the standard multinomial formulation
				}
			}
			else   // ham (non-abusive) doc
			{
				for (int j = 0; j < p0vect.size(); j++)
				{
					p0vect[j] += train_mat[i][j];
					p0Denom += train_mat[i][j];
				}
			}
		}

		for (int i = 0; i < p1vect.size(); i++)
		{
			p0vect[i] = log(p0vect[i] / p0Denom);    // log conditional probability P(word_i | ham)
			p1vect[i] = log(p1vect[i] / p1Denom);    // log conditional probability P(word_i | spam)
		}

		cout << endl;
	}

	int classify_NB(const char  *filename)
	{
		return_vec = new int[my_vocab_list.size() + 1]();

		fin.open(filename);
		if (!fin)
		{
			cerr << "fail to open the file " << filename << endl;
			exit(1);
		}
		string word;
		while (fin >> word)
		{
			map<string, int>::iterator it = my_vocab_list.find(word); // use find() so unseen words are not inserted into the vocabulary
			if (it != my_vocab_list.end())
			{
				return_vec[it->second] += 1;
			}
		}
		fin.close();

		cout << endl;
		// log P(c) + sum_j count_j * log P(word_j | c); the initial value must be 0.0f, otherwise inner_product accumulates in int and truncates
		float p1 = inner_product(p1vect.begin() + 1, p1vect.end(), return_vec + 1, 0.0f) + log(p_abusive);
		float p0 = inner_product(p0vect.begin() + 1, p0vect.end(), return_vec + 1, 0.0f) + log(1 - p_abusive);

		cout << "p1 = " << p1 << "  " << "p0 = " << p0 << endl;

		if (p1>p0)
		{
			return 1;
		}
		else
		{
			return 0;
		}
	}

	void get_error_rate()
	{
		string filename;
		char buf[3];
		string buf_str;
		int error_count = 0;
		for (int i = 1; i <= test_data_num; i++)
		{
			sprintf(buf, "%d", i);
			buf_str = buf;
			filename = "./email/hamParse/" + buf_str + ".dat";
			if (classify_NB(filename.c_str()) != 0)
			{
				error_count++;
			}

			filename = "./email/spamParse/" + buf_str + ".dat";
			if (classify_NB(filename.c_str()) != 1)
			{
				error_count++;
			}
		}
		cout << "the error rate is : " << (float)error_count / (float)(2 * test_data_num) << endl;

	}
};

int main()
{
	NaiveBayes nb;        // constructor splits the texts and stores them in vector< vector<string> > list_of_docs; each element is one document's tokens
	nb.create_vocab_list(); // build the vocabulary, stored in a red-black tree (std::map): key = word, value = word index over the whole corpus
	/* Build the training matrix:
	   rows   : one per document
	   columns: one per distinct word across all documents
	   entry  : the number of times that word occurs in that document
	*/
	nb.get_train_matrix(); 
	//nb.print();  
	nb.train_NB0();

	char  doc1_to_classify[] = "./email/hamParse/1.dat";
	char  doc2_to_classify[] = "./email/spamParse/1.dat";
	cout << "doc1 classified as : " << nb.classify_NB(doc1_to_classify) << endl;
	cout << "doc2 classified as : " << nb.classify_NB(doc2_to_classify) << endl;

	nb.get_error_rate();
	return 0;
}
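To try the program, compile it with any standard C++ compiler, for example g++ -std=c++11 naive_bayes.cpp -o naive_bayes (the source file name here is an assumption), and run it from the directory containing the email/ folder, since all data paths in the code are relative.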