C++單刷《機器學習實戰》——kNN演算法完整程式碼
阿新 • • 發佈:2018-11-25
#include <iostream> #include <cmath> #include<map> #include<string> #include<sstream> #include<fstream> #include<vector> #include<algorithm> using namespace std; double group[4][2] = { { 1.0, 1.1 }, { 1.0, 1.0 }, { 0, 0 }, { 0, 0.1 } }; string labels[4] = { "A", "A", "B", "B" }; struct man { double fly; double game; double icecream; string eval; }; void sort(double* data, int n, int k) //氣泡排序,採用氣泡排序的目的是以最快速度找到最大的前k個值 //data:要排序的陣列,n:陣列大小,k:要找到的前k個值 { int temp; for (int i = 0; i < k; i++) { for (int j = i+1; j < n; j++) { if (*(data + i) > *(data + j)) { temp = *(data + i); *(data + i) = *(data + j); *(data + j) = temp; } } } } void sortIndex(double* data, int* sorted_index2, int n) //排序,並返回排序後的原陣列索引 //data:原始陣列,sorted_index2:排序後的原陣列索引,n:陣列大小 { int index = 0; int* sorted_index = new int[n]; for (int i = 0; i < n; i++) { index = 0; for (int j = 0; j < n; j++) { if (data[i] > data[j]) index++; else if (data[i] == data[j] && i > j) index++; } sorted_index[i] = index; } for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { if (i == sorted_index[j]) sorted_index2[i] = j; } } delete sorted_index; } vector<man> readFile(const char* file_name) //從檔案中讀取資料並存入結構體陣列 { string data_str; vector<man> data_list; fstream file; file.open(file_name,ios::in); if (file.is_open()) { while (getline(file, data_str)) { man data; istringstream record(data_str); record >> data.fly; record >> data.game; record >> data.icecream; record >> data.eval; data_list.push_back(data); } } return data_list; } void data2matrix(vector<man> data_list, double* dataSet, string labels[], double& length_fly, double& length_game, double& length_icecream) //將結構體陣列轉化為二維矩陣,並歸一化 //data_list:結構體陣列,dataSet:轉化為的二維矩陣,labels:標籤陣列,length_fly....:樣本特徵最大值與最小值之差 { int index = 0; auto it = data_list.begin(); double min_fly = it->fly; double max_fly = it->fly; double min_game = it->game; double max_game = it->game; double min_icecream = it->icecream; double max_icecream = it->icecream; for (; it != data_list.end(); ++it) { if (min_fly > it->fly) min_fly = it->fly; if (max_fly < it->fly) max_fly = it->fly; if (min_game > it->game) min_game = it->game; if (max_game < it->game) max_game = it->game; if (min_icecream > it->icecream) min_icecream = it->icecream; if (max_icecream < it->icecream) max_icecream = it->icecream; } length_fly = max_fly - min_fly; length_game = max_game - min_game; length_icecream = max_icecream - min_icecream; for (auto it = data_list.begin(); it != data_list.end(); ++it) { *(dataSet + index * 3) = it->fly / length_fly; *(dataSet + index * 3 + 1) = it->game / length_game; *(dataSet + index * 3 + 2) = it->icecream / length_icecream; labels[index] = it->eval; ++index; } } void data2matrix2(man person, double* data,double length_fly, double length_game, double length_icecream) { data[0] = person.fly / length_fly; data[1] = person.game / length_game; data[2] = person.icecream / length_icecream; } string classify(double* inX,double* dataSet,string labels[],int k,int size,int dataSetSize) //kNN分類演算法 //inX:未分類的輸入資料,dataSet:樣本集,labels:標籤,k:k值,size:資料的特徵數量,dataSetSize:樣本集數量 { double sum = 0; double* diff_array = new double[size]; double* diff_all = new double[dataSetSize]; int* sorted_index = new int[dataSetSize]; string label; map<string, int> label_count; for (int i = 0; i < dataSetSize; i++) //計算當前點與各樣本點的歐式距離,並存入陣列diff_array { sum = 0; for (int j = 0; j < size; j++) { diff_array[j] = *(inX + j) - *(dataSet + i*size + j); sum += (diff_array[j] * diff_array[j]); } diff_all[i] = sqrt(sum); } //排序,並返回排序後的原陣列索引 sortIndex(diff_all, sorted_index, dataSetSize); for (int i = 0; i < k; i++) //計算前k個索引對應標籤的出現次數,存入關聯容器label_count { label = labels[sorted_index[i]]; ++label_count[label]; } //找出出現次數最多的標籤,返回 auto map_it = label_count.begin(); label = map_it->first; int max_count = map_it->second; for (; map_it != label_count.end(); map_it++) { if (max_count < map_it->second) { max_count = map_it->second; label = map_it->first; } } delete diff_array; delete diff_all; delete sorted_index; return label; } int main() { /*string result; string line; double point[2]; cout << "please input the coodinate of the pixel" << endl; while (getline(cin, line)) { istringstream record(line); record >> point[0]; record >> point[1]; result = classify(point, &group[0][0], labels, 3, 2, 4); cout << "the result is: " << result << endl; cout << "please input the coodinate of the pixel" << endl; }*/ vector<man> data_list; data_list = readFile("datingTestSet.txt"); int size = 3; int dataSetSize = data_list.size(); double* dataSet = new double[dataSetSize*3]; string* labels = new string[dataSetSize]; double length_fly = 0; double length_game = 0; double length_icecream = 0; data2matrix(data_list, dataSet, labels, length_fly, length_game, length_icecream); //測試,ratio為測試集佔資料集總量 double ratio = 0.1; string result; int error_count = 0; int num_test = dataSetSize * ratio; string label; for (int i = 0; i < num_test; i++) { result = classify(dataSet + i * 3, dataSet + num_test * 3, labels + num_test, 3, 3, dataSetSize - num_test); cout <<i<<"times "<<"The classifier came back with: "<< result <<",the real answer is "<<labels[i]<< endl; label = labels[i]; if (result != label) { ++error_count; } } double err_rate = (double)error_count / (double)num_test; cout << "The total error rate is: " << err_rate << endl; man person; double* data = new double[3]; string line; cout << "Please input the time of fly,game and the consume of icrcreame" << endl; while (getline(cin, line)) { cout << "Please input the time of fly,game and the consume of icrcreame" << endl; istringstream record(line); record >> person.fly; record >> person.game; record >> person.icecream; data2matrix2(person, data, length_fly, length_game, length_icecream); result = classify(data, dataSet, labels, 3, 3, dataSetSize); person.eval = result; cout << result << endl; } delete dataSet; delete data; return 0; }