
Naive Bayes Spam Filtering: A Detailed Code Example

1. Problem Description

Filter spam emails.

2. Thought Process

(1) Collect data: text files are provided.

(2) Prepare data: parse the text files into token vectors.

Here we need to build our own word list from the given text documents (splitting the text into tokens and filtering out what we do not need); in other words, we must establish parsing rules and a filter that fit the actual data (Python's support for this is very convenient, though other languages such as C++ work as well). In the implementation this can be a separate function, as sketched below.
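For illustration, here is a minimal sketch of such a parsing function in C++. The function name text_parse and the "drop tokens shorter than 3 characters" rule are assumptions, mirroring the textParse.py convention referenced in the listing below, not part of the original code:

#include <cctype>
#include <string>
#include <vector>

// Split text on non-alphanumeric characters, lowercase every token,
// and keep only tokens longer than 2 characters.
std::vector<std::string> text_parse(const std::string &text)
{
	std::vector<std::string> tokens;
	std::string cur;
	for (size_t i = 0; i < text.size(); i++)
	{
		unsigned char c = text[i];
		if (std::isalnum(c))
		{
			cur += (char)std::tolower(c);
		}
		else
		{
			if (cur.size() > 2) tokens.push_back(cur);  // filter very short tokens
			cur.clear();
		}
	}
	if (cur.size() > 2) tokens.push_back(cur);  // flush the last token
	return tokens;
}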

(3) Analyze data: inspect the tokens to make sure parsing is correct.

(4) Train the algorithm: run a training function over the data (essentially, count token frequencies, then turn those frequencies into probability estimates using maximum-likelihood or Bayesian estimation; in short, compute probabilities from the word vectors).

Apply Bayes' theorem:


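For each class $c_i$ (spam or ham) the classifier compares the posterior probability of the class given the word vector $\mathbf{w}$; the formula below is the standard form, reconstructed here because the original figure is missing:

$$P(c_i \mid \mathbf{w}) = \frac{P(\mathbf{w} \mid c_i)\,P(c_i)}{P(\mathbf{w})}$$

Under the naive conditional-independence assumption, $P(\mathbf{w} \mid c_i) = \prod_j P(w_j \mid c_i)$, and since $P(\mathbf{w})$ is identical for both classes it can be dropped, so it suffices to compare $\log P(c_i) + \sum_j \log P(w_j \mid c_i)$. This log-space sum is exactly what classify_NB() computes via inner_product in the listing below.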
(5) Test the algorithm: use the trained classifier to classify documents, and build a test function that computes the error rate over a document set.
(6) Use the algorithm: build a complete program that classifies input documents.

3. Code Example and Walkthrough
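Note: the listing assumes the parsed token files already exist on disk, one whitespace-separated token stream per file, laid out as ./email/hamParse/1.dat through ./email/hamParse/25.dat and ./email/spamParse/1.dat through ./email/spamParse/25.dat (these paths come straight from the code below).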

/*
* code list 4-1 : conversion function from docs list to vocabulary list
* code list 4-2 : training function for the Naive Bayes classifier
* code list 4-3 : naive Bayes classify function
* code list 4-4 : naive Bayes bag-of-words model
* code list 4-5 : text parse (textParse.py) and spam email test function get_error_rate()
* */

#include <iostream>
#include <map>
#include <set>
#include <cmath>
#include <vector>
#include <algorithm>
#include <numeric>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <fstream>
using namespace std;

class NaiveBayes         // Naive Bayes classifier
{
private:
	vector< vector<string> > list_of_docs;
	vector<int> list_classes;   // class-label vector (0 = ham, 1 = spam)
	map<string, int>  my_vocab_list;
	int *return_vec;
	vector< vector<int> > train_mat;  // training matrix
	vector<float> p0vect;
	vector<float> p1vect;
	float p_abusive;
	ifstream fin;
	ofstream fout;
	int test_data_num;

public:
	NaiveBayes()
	{
		cout << "please input the num of test data which should be less than 24 : " << endl;
		cin >> test_data_num;
		vector<string> vec;
		string word;
		string filename;
		char buf[3];
		string buf_str;
		// initialization: read the pre-parsed word files and store each file's tokens in list_of_docs (one element per document).
		// Files 1..test_data_num are held out for testing; files test_data_num+1..25 are used for training.
		for (int i = test_data_num + 1; i <= 25; i++)
		{
			sprintf(buf, "%d", i);  //convert digit to string  
			vec.clear();
			buf_str = buf;
			filename = "./email/hamParse/" + buf_str + ".dat";
			//cout<<"filename : "<<filename<<endl;  
			fin.open(filename.c_str());
			if (!fin)
			{
				cerr << "open the file " << filename << " error" << endl;
				exit(1);
			}
			while (fin >> word)
			{
				vec.push_back(word);
			}
			list_of_docs.push_back(vec);
			list_classes.push_back(0);
			filename.clear();
			fin.close();
		}

		for (int i = test_data_num + 1; i <= 25; i++)
		{
			sprintf(buf, "%d", i);
			vec.clear();
			buf_str = buf;
			filename = "./email/spamParse/" + buf_str + ".dat";
			//cout<<"filename : "<<filename<<endl;  
			fin.open(filename.c_str());
			if (!fin)
			{
				cerr << "open the file " << filename << " error" << endl;
				exit(1);
			}
			while (fin >> word)
			{
				vec.push_back(word);
			}
			list_of_docs.push_back(vec);
			list_classes.push_back(1);
			filename.clear();
			fin.close();
		}

	}

	~NaiveBayes()
	{
		fin.close();
		fout.close();
		list_of_docs.clear();
		list_classes.clear();
		my_vocab_list.clear();
		train_mat.clear();
		//delete [] return_vec;  
		p0vect.clear();
		p1vect.clear();
	}


	void create_vocab_list()
	{
		vector< vector<string> > ::iterator it = list_of_docs.begin();
		int index = 1;
		while (it != list_of_docs.end())
		{
			//vector<string> vec( *it.begin(),*it.end() );  
			vector<string> vec = *it;

			vector<string> ::iterator tmp_it = vec.begin();

			while (tmp_it != vec.end())
			{
				//cout<<*tmp_it<<" ";  
				if (my_vocab_list[*tmp_it] == 0)
				{
					my_vocab_list[*tmp_it] = index++; // index is the word's position in the vocabulary
				}
				tmp_it++;
			}
			it++;
		}

	}//create_vocab_list  

	// build a bag-of-words count vector for document idx: entry j counts how often vocabulary word j occurs
	void beg_of_words_to_vec(int idx)
	{
		//cout<<"set of words to vec begin the document id is : "<<idx<<endl;  
		int len = my_vocab_list.size() + 1;
		return_vec = new int[len](); // note: "new int[len]()" value-initializes every element to zero, unlike plain "new int[len]"
		fill(return_vec, return_vec + len, 0); // redundant with the value-initialization above, kept for clarity
		vector< vector<string> >::iterator it = list_of_docs.begin() + idx - 1;
		vector<string> vec = *it;
		vector<string> ::iterator itt = vec.begin();
		int pos = 0;
		while (itt != vec.end())
		{
			//          cout<<*itt<<" ";  
			pos = my_vocab_list[*itt];
			if (pos != 0)
			{
				return_vec[pos] += 1;
			}
			itt++;
		}
	}//beg_of_words_to_vec  

	void get_train_matrix()
	{
		cout << "get train matrix begin : " << endl;
		train_mat.clear();
		for (int i = 1; i <= list_of_docs.size(); i++)
		{
			beg_of_words_to_vec(i);
			vector<int> vec(return_vec, return_vec + my_vocab_list.size() + 1);
			train_mat.push_back(vec);
			delete[]return_vec;
		}
	}//get train matrix  

	void print()
	{
		cout << "print the train matrix begin : " << endl;
		vector< vector<int> > ::iterator it = train_mat.begin();
		while (it != train_mat.end())
		{
			vector<int> vec = *it;
			vector<int> ::iterator itt = vec.begin();
			while (itt != vec.end())
			{
				cout << *itt << " ";
				itt++;
			}
			cout << endl;
			it++;
		}

	}//print()  

	void train_NB0()
	{
		int num_train_docs = train_mat.size();
		cout << "num_train_docs = " << num_train_docs << endl;
		int num_words = train_mat[0].size() - 1;
		/* calculate the number of spam (class-1) documents */
		int sum = accumulate(list_classes.begin(), list_classes.end(), 0);
		cout << "sum = " << sum << endl;
		//float p_abusive = (float)sum/(float)num_train_docs;  
		p_abusive = (float)sum / (float)num_train_docs;
		cout << "p_abusive = " << p_abusive << endl;

		p0vect.resize(train_mat[0].size(), 1);  // initialize every word count to lambda = 1 (Laplace smoothing) so no estimated probability is zero
		p1vect.resize(train_mat[0].size(), 1);
		cout << "p0vect.size() = " << p0vect.size() << " , p1vect.size() = " << p1vect.size() << endl;
		float p0Denom = 2.0; // total word count in ham docs, initialized to 2 (the number of classes) for the smoothing
		float p1Denom = 2.0; // total word count in spam docs
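		/* With lambda = 1 and the denominators starting at 2, the loop below
		 * yields the Laplace-smoothed estimate
		 *     P(word_j | c) = (count_j(c) + 1) / (total_words(c) + 2),
		 * so every conditional probability stays strictly positive. */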

		/* accumulate the per-word counts and the per-class denominators */
		for (int i = 0; i < list_classes.size(); i++)
		{
			if (list_classes[i] == 1)  // spam (abusive) doc
			{
				for (int j = 0; j < p1vect.size(); j++)
				{
					p1vect[j] += train_mat[i][j];
					p1Denom += train_mat[i][j]; // sum all word occurrences; the original only counted entries equal to 1 (a point the author flagged as unclear), whereas summing the counts matches the standard multinomial formulation
				}
			}
			else   // ham (non-abusive) doc
			{
				for (int j = 0; j < p0vect.size(); j++)
				{
					p0vect[j] += train_mat[i][j];
					p0Denom += train_mat[i][j];
				}
			}
		}

		for (int i = 0; i < p1vect.size(); i++)
		{
			p0vect[i] = log(p0vect[i] / p0Denom);    // log conditional probability P(word_i | ham)
			p1vect[i] = log(p1vect[i] / p1Denom);    // log conditional probability P(word_i | spam)
		}

		cout << endl;
	}

	int classify_NB(const char  *filename)
	{
		return_vec = new int[my_vocab_list.size() + 1]();

		fin.open(filename);
		if (!fin)
		{
			cerr << "fail to open the file " << filename << endl;
			exit(1);
		}
		string word;
		while (fin >> word)
		{
			map<string, int>::iterator it = my_vocab_list.find(word); // use find() so unseen words are not inserted into the vocabulary
			if (it != my_vocab_list.end())
			{
				return_vec[it->second] += 1;
			}
		}
		fin.close();

		cout << endl;
		// log P(c) + sum_j count_j * log P(word_j | c); the initial value must be 0.0f, otherwise inner_product accumulates in int and truncates
		float p1 = inner_product(p1vect.begin() + 1, p1vect.end(), return_vec + 1, 0.0f) + log(p_abusive);
		float p0 = inner_product(p0vect.begin() + 1, p0vect.end(), return_vec + 1, 0.0f) + log(1 - p_abusive);

		cout << "p1 = " << p1 << "  " << "p0 = " << p0 << endl;

		if (p1>p0)
		{
			return 1;
		}
		else
		{
			return 0;
		}
	}

	void get_error_rate()
	{
		string filename;
		char buf[3];
		string buf_str;
		int error_count = 0;
		for (int i = 1; i <= test_data_num; i++)
		{
			sprintf(buf, "%d", i);
			buf_str = buf;
			filename = "./email/hamParse/" + buf_str + ".dat";
			if (classify_NB(filename.c_str()) != 0)
			{
				error_count++;
			}

			filename = "./email/spamParse/" + buf_str + ".dat";
			if (classify_NB(filename.c_str()) != 1)
			{
				error_count++;
			}
		}
		cout << "the error rate is : " << (float)error_count / (float)(2 * test_data_num) << endl;

	}
};

int main()
{
	NaiveBayes nb;        // constructor splits the texts and stores them in vector< vector<string> > list_of_docs; each element is one document's tokens
	nb.create_vocab_list(); // build the vocabulary, stored in a red-black tree (std::map): key = word, value = word index over the whole corpus
	/* Build the training matrix:
	   rows   : one per document
	   columns: one per distinct word across all documents
	   entry  : the number of times that word occurs in that document
	*/
	nb.get_train_matrix(); 
	//nb.print();  
	nb.train_NB0();

	char  doc1_to_classify[] = "./email/hamParse/1.dat";
	char  doc2_to_classify[] = "./email/spamParse/1.dat";
	cout << "doc1 classified as : " << nb.classify_NB(doc1_to_classify) << endl;
	cout << "doc2 classified as : " << nb.classify_NB(doc2_to_classify) << endl;

	nb.get_error_rate();
	return 0;
}
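To try the program, compile it with any standard C++ compiler, for example g++ -std=c++11 naive_bayes.cpp -o naive_bayes (the source file name here is an assumption), and run it from the directory containing the email/ folder, since all data paths in the code are relative.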