1. 程式人生 > >KNN分類與迴歸-C++實現

KNN分類與迴歸-C++實現

#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring> 
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>

using namespace std;


// One labelled text sample together with its one-hot encoding.
struct train_data {
	int index;                      // sequence number of the training text
	int emotion_value;              // numeric emotion value
	std::string emotion;            // emotion label
	std::vector<std::string> word;  // words of the text
	int onehot[1000];               // this sample's row of the one-hot matrix
	double distance;                // distance to the sample currently being compared

	// Every argument is optional; the word list starts empty and all 1000
	// one-hot entries start at 0.
	train_data(int a = 0, int b = 0, std::string c = "", double d = 0.0)
		: index(a), emotion_value(b), emotion(c), word(), onehot(), distance(d) {}
};

// Pairs an emotion label with a counter.
struct ct {
	std::string s;  // emotion label
	int num;        // number of occurrences

	ct(std::string a = "", int b = 0) : s(a), num(b) {}
};
// Program-wide state shared by the functions below.
vector<string> train_text;			// every complete training line as read from the file
vector<string> all_words;			// all distinct words (vertical axis of the one-hot matrix)
vector<train_data> all_trains;		// all training texts (horizontal axis of the one-hot matrix)
int right_sum;						// number of correct predictions

// Forward declarations; the definitions follow main().
// reading_file: loads the training file into the globals above.
// get_onehot: fills the onehot row of every entry of all_trains.
// class_calculating: classifies the test file using the given k and prints the number of hits.
// edistance / mdistance: squared Euclidean / Manhattan distance between two one-hot rows.
// cmp / cmp2: ascending comparators on train_data::distance / ct::num.
void reading_file(void );
void get_onehot(void );
void class_calculating(int );
double edistance(train_data , train_data );
double mdistance(train_data , train_data );
bool cmp(const train_data & , const train_data & );
bool cmp2(const ct & , const ct & );

// Entry point of the classification program.
// The loop header sweeps k from 1 to 14, but the unconditional break at the
// end of the body leaves the loop after the first pass, so only k = 1 runs.
int main() {
	for (int k = 1; k < 15; ++k) {
		// Start every pass from empty global state.
		all_trains.clear();
		all_words.clear();
		train_text.clear();

		reading_file();
		get_onehot();

		std::cout << "input k = " << k << std::endl;
		std::cout << "不重複的詞個數 " << all_words.size() << std::endl;

		class_calculating(k);
		break;
	}

	return 0;
}

void reading_file() {
	ifstream train("train.txt");
	char read[100];
	string temp;
	train.getline(read, 100);
	while (!train.eof()) {
		train.getline(read, 100);
		temp = read;
		train_text.push_back(temp);
	}
	train.close();	
	stringstream s;
	int index;
	int emotion_value;
	string emotion;
	string word;
	for (int i = 0; i < train_text.size(); i ++) {
		s.str(train_text[i]);
		s >> index;
		s >> emotion_value;
		s >> emotion;
		
		//建立一個新的訓練文字資料 
		train_data new_train;
		new_train.index = index;
		new_train.emotion_value = emotion_value;
		new_train.emotion = emotion;
				
		while (s != NULL) {
			s >> word;
			
			//統計所有單詞
			bool flag1 = true;
			for (int i = 0; i < all_words.size(); i ++) {
				if (all_words[i] == word) {
					flag1 = false;
					break;					
				}
				else 
					continue;
			}
			if (flag1)
				all_words.push_back(word);
			
			//統計每個訓練文字中的單詞
			bool flag2 = true;
			for (int i = 0; i < new_train.word.size(); i ++) {
				if (new_train.word[i] == word) {
					flag2 = false;
					break;
				}
				else 
					continue;
			}
			if (flag2)
				new_train.word.push_back(word);
		}
		s.clear();
		
		all_trains.push_back(new_train);
	}
	//test 
	/*
	ofstream t("test.txt");
	for (int i = 0; i < all_trains.size(); i ++) {
		t << all_trains[i].index << " " << all_trains[i].emotion_value << " " << all_trains[i].emotion;
		for (int j = 0; j < all_trains[i].word.size(); j ++)
			t << " " << all_trains[i].word[j];
		t << endl;
	}
	
	for (int i = 0; i < all_words.size(); i ++)
		cout << i + 1 << " " << all_words[i] << endl;
	*/
}

void get_onehot() {
	int i, j, k;
	for (i = 0; i < all_trains.size(); i ++) {
		for (j = 0; j < all_trains[i].word.size(); j ++) {
			for (k = 0; k < all_words.size(); k ++) {
				if (all_trains[i].word[j] == all_words[k]) 
					all_trains[i].onehot[k] = 1;
			}
		}
	}
	//cout << "hot" <<endl;
	//test
	/*
	ofstream s("testonehot.txt");
	for (i = 0; i < all_words.size(); i ++)
		s << setw(14) << left << all_words[i];
	s << endl;
	for (j = 0; j < all_trains.size(); j ++) {
		for (k = 0; k < all_words.size(); k ++)
			s << setw(14) << left << all_trains[j].onehot[k];
		s << endl;
	}
	*/
}


void class_calculating(int k) {
	ifstream t("test.txt");
	right_sum = 0;
	char c[100];
	string temp;
	t.getline(c, 100);
	while (t.getline(c, 100)) {
		train_data test_train;
		char *p = strtok(c, " ");
		p = strtok(NULL, " ");
		p = strtok(NULL, " ");
		temp = p;
		test_train.emotion = temp;
		//cout << temp << endl;
		p = strtok(NULL, " ");
		
		while (p != NULL) {
			temp = p;
			bool flag = true;
			for (int i = 0; i < test_train.word.size(); i ++) {
				if (test_train.word[i] == temp) {
					flag = false;
					break;
				}
			}
			if (flag)
				test_train.word.push_back(temp);
			p = strtok(NULL, " ");
		}
		
		double d1 = 0;
		for (int i = 0; i < test_train.word.size(); i ++) {
			bool flag3 = true;
			for (int j = 0; j < all_words.size(); j ++) {
				if (test_train.word[i] == all_words[j]) {
					test_train.onehot[j] = 1;
					flag3 = false;
					break;
				}
				else
					continue;
			}
			if (flag3) {									//如果訓練樣本中沒有這個單詞,但又不能改變原始樣本 
				d1 ++;
			}
		}

		for (int i = 0; i < all_trains.size(); i ++) {
		//all_trains[i].distance = sqrt(d1 + edistance(all_trains[i], test_train));
			double part1 = 0;
			for (int j = 0; j < all_words.size(); j ++)
				part1 += all_trains[i].onehot[j] * test_train.onehot[j];
			all_trains[i].distance = part1/(sqrt(all_trains[i].word.size())*sqrt(test_train.word.size() + d1));
		} 
		sort(all_trains.begin(), all_trains.end(), cmp);	

		vector<ct> v;
		ct node("anger", 0);
		v.push_back(node);
		ct node1("disgust", 0);
		v.push_back(node1);
		ct node2("fear", 0);
		v.push_back(node2);
		ct node3("joy", 0);
		v.push_back(node3);
		ct node4("sad", 0);
		v.push_back(node4);
		ct node5("surprise", 0);
		v.push_back(node5);
		for (int i = 0; i < k; i ++) {
			for (int j = 0; j < v.size(); j ++) {
				if (all_trains[i].emotion == v[j].s) {
					v[j].num ++;
					break;
				}
			}
		}
		sort(v.begin(), v.end(), cmp2);
		if (v.back().s == test_train.emotion) {
			right_sum ++;
		}

			
	}
	cout << "正確個數" << right_sum << endl;
}

// Squared Euclidean distance (no square root applied) between the one-hot
// rows of a and b, taken over the first all_words.size() entries.
double edistance(train_data a, train_data b) {
	double sum_of_squares = 0.0;
	const std::size_t dims = all_words.size();
	std::size_t d = 0;
	while (d < dims) {
		const int diff = a.onehot[d] - b.onehot[d];
		sum_of_squares += std::pow(static_cast<double>(diff), 2.0);
		++d;
	}
	return sum_of_squares;
}
// Manhattan distance between the one-hot rows of a and b, taken over the
// first all_words.size() entries.
double mdistance(train_data a, train_data b) {
	double sum_of_gaps = 0.0;
	const std::size_t dims = all_words.size();
	for (std::size_t d = 0; d < dims; ++d) {
		const int diff = a.onehot[d] - b.onehot[d];
		sum_of_gaps += (diff < 0) ? -diff : diff;
	}
	return sum_of_gaps;
}
// Sort predicate: true when a's distance is strictly smaller than b's
// (ascending order by distance).
bool cmp(const train_data &a, const train_data &b) {
	return b.distance > a.distance;
}
// Sort predicate: true when a's counter is strictly smaller than b's
// (ascending order by num).
bool cmp2(const ct &a, const ct &b ) {
	return b.num > a.num;
}

KNN迴歸:夾角餘弦

#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring> 
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>

using namespace std;


// One text sample with its one-hot encoding and its list of target values.
struct train_data {
	int index;                      // sequence number of the training text
	int emotion_value;              // numeric emotion value
	std::string emotion;            // emotion label
	std::vector<std::string> word;  // words of the text
	int onehot[1000];               // this sample's row of the one-hot matrix
	double distance;                // distance to the sample currently being compared
	std::vector<double> fre_set;    // numeric values read from the columns after the text

	// Every argument is optional; both vectors start empty and all 1000
	// one-hot entries start at 0.
	train_data(int a = 0, int b = 0, std::string c = "", double d = 0.0)
		: index(a), emotion_value(b), emotion(c), word(), onehot(), distance(d), fre_set() {}
};

// Pairs a string label with a counter.
struct ct {
	std::string s;  // label
	int num;        // counter

	ct(std::string a = "", int b = 0) : s(a), num(b) {}
};

// Program-wide state shared by the functions below.
vector<string> train_text;			// every complete training line
vector<string> all_words;			// all distinct words (vertical axis of the one-hot matrix)
vector<train_data> all_trains;		// all training texts (horizontal axis of the one-hot matrix)
int right_sum;						// number of correct predictions

// Forward declarations; the definitions follow main().
// reading_file: loads the training CSV into all_words and all_trains.
// get_onehot: fills the onehot row of every entry of all_trains.
// regre_calculating: writes one prediction row per validation sample using the given k.
// edistance / mdistance: squared Euclidean / Manhattan distance between two one-hot rows.
// cmp: comparator that orders train_data by descending distance field.
void reading_file(void );
void get_onehot(void );
void regre_calculating(int );
double edistance(train_data , train_data );
double mdistance(train_data , train_data );
bool cmp(const train_data & , const train_data & );


// Entry point of the regression program: loads the training data, builds the
// one-hot rows, reads k from standard input and runs the regression.
//
// Returns 1 without running the regression when k cannot be read or is not
// positive; with such a k the weighted average in regre_calculating would
// have nothing to average and would divide by zero.
int main() {
	train_text.clear();
	all_words.clear();
	all_trains.clear();

	reading_file();
	// The original author's debug note recorded all_words.size() == 904 here
	// for their data set.
	get_onehot();

	int k = 0;
	std::cout << "input k = ";
	if (!(std::cin >> k) || k <= 0) {
		std::cerr << "k must be a positive integer" << std::endl;
		return 1;
	}
	regre_calculating(k);

	return 0;
}

// Appends `word` to `bag` unless an equal string is already stored.
static void keep_unique(std::vector<std::string> &bag, const std::string &word) {
	if (std::find(bag.begin(), bag.end(), word) == bag.end())
		bag.push_back(word);
}

// Loads "Dataset_train.csv" into the globals all_words and all_trains.
//
// The first line is a header. Every other row is split on ',':
//   column 1      - ignored
//   column 2      - the text; its whitespace-separated words are stored once
//                   per sample and once in the global vocabulary
//   columns 3...  - numeric values appended to fre_set in column order
//
// Rows are read into a std::string, so row length is not limited. Blank rows
// and rows without a text column are skipped. Empty fields keep their
// position, and a value that cannot be parsed is stored as 0.0 so the
// remaining columns stay aligned. If the file cannot be opened a message is
// printed and the globals are left untouched.
void reading_file() {
	std::ifstream t("Dataset_train.csv");
	if (!t) {
		std::cerr << "reading_file: cannot open Dataset_train.csv" << std::endl;
		return;
	}

	std::string line;
	std::getline(t, line);				// header row
	while (std::getline(t, line)) {
		// Tolerate Windows line endings.
		if (!line.empty() && line[line.size() - 1] == '\r')
			line.erase(line.size() - 1);
		if (line.empty())
			continue;

		std::istringstream row(line);
		std::string field;
		std::getline(row, field, ',');			// column 1 is not used
		if (!std::getline(row, field, ','))
			continue;							// no text column

		train_data new_train;
		std::istringstream words(field);
		std::string word;
		while (words >> word) {
			keep_unique(all_words, word);		// global vocabulary
			keep_unique(new_train.word, word);	// words of this sample
		}

		while (std::getline(row, field, ',')) {
			std::istringstream number(field);
			double fre = 0.0;
			number >> fre;						// stays 0.0 when the field is not numeric
			new_train.fre_set.push_back(fre);
		}

		all_trains.push_back(new_train);
	}
}

void get_onehot() {
	int i, j, k;
	for (i = 0; i < all_trains.size(); i ++) {
		for (j = 0; j < all_trains[i].word.size(); j ++) {
			for (k = 0; k < all_words.size(); k ++) {
				if (all_trains[i].word[j] == all_words[k]) 
					all_trains[i].onehot[k] = 1;
			}
		}
	}
}

// Predicts six values for every row of "Dataset_validation.csv" and writes
// them, tab-separated and one row per line, to
// "14353324_xiangketing_regression.txt".
//
// k is the number of nearest neighbours to average; it is clamped to the
// range [0, all_trains.size()].
//
// The first line of the input is a header. In every other row the second
// comma-separated column is the text. Blank rows produce no output row.
//
// Each output value i is the similarity-weighted sum of fre_set[i] over the
// nearest neighbours, divided by the total over all six values so the row is
// normalised. Similarity is the cosine between one-hot rows, and a zero
// denominator yields similarity 0 instead of NaN. Neighbours that carry
// fewer than six values contribute nothing for the missing ones. When the
// total weight is zero (no neighbour shares a word with the text) the row is
// written as a uniform 1/6 distribution instead of NaN.
//
// Side effects: overwrites the distance field of every element of all_trains
// and reorders all_trains.
void regre_calculating(int k) {
	const std::size_t kOutputs = 6;

	std::ifstream t("Dataset_validation.csv");
	if (!t)
		std::cerr << "regre_calculating: cannot open Dataset_validation.csv" << std::endl;
	std::ofstream out("14353324_xiangketing_regression.txt");

	// Only vocabulary slots that fit into train_data::onehot can be compared.
	const std::size_t capacity =
		sizeof(all_trains[0].onehot) / sizeof(all_trains[0].onehot[0]);
	const std::size_t dims = std::min<std::size_t>(all_words.size(), capacity);

	// Never average over more neighbours than there are training samples.
	std::size_t neighbours = 0;
	if (k > 0)
		neighbours = std::min<std::size_t>(static_cast<std::size_t>(k), all_trains.size());

	std::string line;
	std::getline(t, line);				// header row
	while (std::getline(t, line)) {
		// Tolerate Windows line endings.
		if (!line.empty() && line[line.size() - 1] == '\r')
			line.erase(line.size() - 1);
		if (line.empty())
			continue;

		std::istringstream row(line);
		std::string field;
		std::getline(row, field, ',');			// column 1 is not used
		if (!std::getline(row, field, ','))
			field.clear();						// no text column: treat as empty text

		// Collect the distinct words of the text.
		train_data test_train;
		std::istringstream words(field);
		std::string token;
		while (words >> token) {
			if (std::find(test_train.word.begin(), test_train.word.end(), token)
					== test_train.word.end())
				test_train.word.push_back(token);
		}

		// Encode the text; d1 counts words that have no vocabulary slot.
		double d1 = 0;
		for (std::size_t i = 0; i < test_train.word.size(); ++i) {
			std::size_t j = 0;
			while (j < dims && all_words[j] != test_train.word[i])
				++j;
			if (j < dims)
				test_train.onehot[j] = 1;
			else
				d1++;
		}

		// The test-side factor is identical for every training sample; it
		// cancels out when the row is normalised below.
		const double test_norm =
			std::sqrt(static_cast<double>(test_train.word.size()) + d1);
		for (std::size_t i = 0; i < all_trains.size(); ++i) {
			double dot = 0;
			for (std::size_t j = 0; j < dims; ++j)
				dot += all_trains[i].onehot[j] * test_train.onehot[j];
			const double denom =
				std::sqrt(static_cast<double>(all_trains[i].word.size())) * test_norm;
			all_trains[i].distance = (denom > 0.0) ? dot / denom : 0.0;
		}
		// cmp puts the largest similarity first.
		std::sort(all_trains.begin(), all_trains.end(), cmp);

		double a[kOutputs];
		double sum = 0;
		for (std::size_t i = 0; i < kOutputs; ++i) {
			double value = 0;
			for (std::size_t j = 0; j < neighbours; ++j) {
				if (i < all_trains[j].fre_set.size())
					value += all_trains[j].fre_set[i] * all_trains[j].distance;
			}
			a[i] = value;
			sum += value;
		}

		for (std::size_t i = 0; i < kOutputs; ++i) {
			if (i != 0)
				out << '\t';
			out << ((sum != 0.0) ? a[i] / sum : 1.0 / kOutputs);
		}
		out << std::endl;
	}
}

// Squared Euclidean distance (no square root applied) between the one-hot
// rows of a and b, taken over the first all_words.size() entries.
double edistance(train_data a, train_data b) {
	double sum_of_squares = 0.0;
	const std::size_t dims = all_words.size();
	std::size_t d = 0;
	while (d < dims) {
		const int diff = a.onehot[d] - b.onehot[d];
		sum_of_squares += std::pow(static_cast<double>(diff), 2.0);
		++d;
	}
	return sum_of_squares;
}
// Manhattan distance between the one-hot rows of a and b, taken over the
// first all_words.size() entries.
double mdistance(train_data a, train_data b) {
	double sum_of_gaps = 0.0;
	const std::size_t dims = all_words.size();
	for (std::size_t d = 0; d < dims; ++d) {
		const int diff = a.onehot[d] - b.onehot[d];
		sum_of_gaps += (diff < 0) ? -diff : diff;
	}
	return sum_of_gaps;
}

// Sort predicate: true when a's distance field is strictly larger than b's
// (descending order).
bool cmp(const train_data &a, const train_data &b) {
	return b.distance < a.distance;
}