機器學習：決策樹cart演算法在分類與迴歸的應用（上）

阿新 • • 發佈：2019-02-07

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace std;

// Chi-square table for the 0.95 confidence level.
// The values match the standard table for 1, 2, 3, ... degrees of freedom, so
// entry k presumably corresponds to k + 1 degrees of freedom (confirm against
// the code that indexes this table).
// The array has 18 slots; the last two entries (17 and 18 degrees of freedom)
// used to be missing from the initializer and were therefore silently 0.
const double CHI[18] = { 0.004,0.103,0.352,0.711,1.145,1.635,2.167,2.733,3.325,3.94,4.575,5.226,5.892,6.571,7.261,7.962,8.672,9.390 };
/* Compute the chi-square value from a multi-dimensional array. */
// NOTE(review): this region was damaged when the page was extracted. The
// template parameter list, the element types of the two vectors, the
// static_cast target type and the remainder of cal_chi's body are missing.
// The stray closing braces below presumably belong to the lost MyTriple /
// MyCompare definitions that the pruning code relies on -- restore all of
// this from the original source before building.
template
double cal_chi(Comparable **arr, int row, int col) {
	vector rowsum(row);
	vector colsum(col);
	Comparable totalsum = static_cast(0);// force 0 to be converted to the Comparable type
	//cout<<"observation"<right.first;
		   }
		};
/* The three types below store how many records of each class occur under
   each attribute value. The template arguments were lost from the published
   text and are restored here from how the types are used in statistic(),
   printStati() and readInput(). */
typedef map<string, int> MAP_REST_COUNT;             // class label -> number of records
typedef map<string, MAP_REST_COUNT> MAP_ATTR_REST;   // key -> per-class counts
typedef vector<MAP_ATTR_REST> VEC_STATI;             // sequence of such tables

const int ATTR_NUM = 6;       // dimension of the independent variables (number of attributes)
vector<string> X(ATTR_NUM);   // attribute names, filled from the header line of the input file
int rest_number;        // number of distinct values of the dependent variable, i.e. number of classes
vector<pair<string, int> > classes;      // each class label together with its record count
int total_record_number;        // total number of records
vector<vector<string> > inputData;      // original input data
vector<vector<string> > testinputData;      // test input data

/*
 * One node of the binary decision tree.
 *
 * A node with no left child is treated as a leaf by the rest of the program.
 * Nodes do not own their children: nothing is freed on destruction.
 */
class node {
public:
	node* parent;       // parent node (NULL when constructed without one)
	node* leftchild;        // left child node
	node* rightchild;       // right child node
	string cond;        // branching condition
	string decision;        // class decided at this node
	double precision;       // accuracy of that decision
	int record_number;      // number of records covered by this node
	int size;       // number of leaf nodes contained in the subtree
	int index;      // sequence number assigned by a level-order traversal
	double alpha;   // increase of the apparent error rate

	/* Create a detached node with default statistics. */
	node() {
		init(NULL);
	}
	/* Create a node attached to parent p (p's child pointers are not touched). */
	node(node* p) {
		init(p);
	}
	/* Create a node attached to p with branching condition c and decision d. */
	node(node* p, const string& c, const string& d) :cond(c), decision(d) {
		init(p);
	}
	/*
	 * Print a one-line summary of this node to cout.
	 * Each child is reported only if it exists; previously rightchild was
	 * dereferenced whenever leftchild was set, which crashed on a node that
	 * had only a left child.
	 */
	void printInfo() const {
		// The "decisoin" label spelling is kept so the output format is unchanged.
		cout << "index:" << index << "\tdecisoin:" << decision << "\tprecision:" << precision << "\tcondition:" << cond << "\tsize:" << size;
		if (parent != NULL)
			cout << "\tparent index:" << parent->index;
		if (leftchild != NULL)
			cout << "\tleftchild:" << leftchild->index;
		if (rightchild != NULL)
			cout << "\trightchild：" << rightchild->index;
		cout << endl;
	}
	/* Print this node and then its left and right subtrees (pre-order). */
	void printTree() const {
		printInfo();
		if (leftchild != NULL)
			leftchild->printTree();
		if (rightchild != NULL)
			rightchild->printTree();
	}

private:
	/* Shared constructor body: link to the parent and reset all statistics. */
	void init(node* p) {
		parent = p;
		leftchild = NULL;
		rightchild = NULL;
		precision = 0.0;
		record_number = 0;
		size = 1;       // a fresh node is a single leaf
		index = 0;
		alpha = 1.0;
	}
};
/* 讀取測試檔案資料，採取的是c++字串流的讀取方式
得到結果：testinputData 資料來源
*/
int readtestInput(string filename) {
	ifstream ifs(filename.c_str());
	if (!ifs) {
		cerr << "open inputfile failed!" << endl;
		return -1;
	}
	map catg;
	string line;
	getline(ifs, line);
	string item;
	istringstream strstm(line);
	strstm >> item;
	for (int i = 0; i> item;
		X[i] = item;
	}
	while (getline(ifs, line)) {
		vector conts(ATTR_NUM + 2);
		istringstream strstm(line);
		//strstm.str(line);
		for (int i = 0; i> item;
			conts[i] = item;
			if (i == conts.size() - 1)
				catg[item]++;
		}
		testinputData.push_back(conts);
	}
	total_record_number = testinputData.size();
	ifs.close();
	return 0;
}
/* 讀取檔案資料，採取的是c++字串流的讀取方式
 得到結果：inputData 資料來源
		   classes 分類標籤以及個數（first：哺乳類，second：6）
		   rest_number 分類的種類數
*/
int readInput(string filename) {
	ifstream ifs(filename.c_str());
	if (!ifs) {
		cerr << "open inputfile failed!" << endl;
		return -1;
	}
	map catg;
	string line;
	getline(ifs, line);
	string item;
	istringstream strstm(line);
	strstm >> item;
	for (int i = 0; i> item;
		X[i] = item;
	}
	while (getline(ifs, line)) {
		vector conts(ATTR_NUM + 2);
		istringstream strstm(line);
		//strstm.str(line);
		for (int i = 0; i> item;
			conts[i] = item;
			if (i == conts.size() - 1)
				catg[item]++;
		}
		inputData.push_back(conts);
	}
	total_record_number = inputData.size();
	ifs.close();
	map::const_iterator itr = catg.begin();//將catg歸類結果放入classes中
	while (itr != catg.end()) {
		classes.push_back(make_pair(itr->first, itr->second));
		itr++;
	}
	rest_number = classes.size();//標籤分為幾類
	return 0;
}

/* Build the statistic stati from inputData: for each attribute, how many
   records of each class there are. */
// NOTE(review): the template arguments of the first parameter and most of
// the loop headers on the first body line were lost when the page was
// extracted (everything between a '<' and the next '>'). Only the tail of
// the counting logic survives; restore the rest from the original source.
void statistic(vector > &inputData, VEC_STATI &stati) {
	for (int i = 1; isecond).find(rest);
				// First time this class label is seen under the current key: start at 1.
				if (iter == (itr->second).end()) {
					(itr->second).insert(make_pair(rest, 1));
				}
				// Otherwise bump the existing count.
				else {
					iter->second += 1;
				}
			}
		}
		// One table is appended per iteration of the outer loop.
		stati.push_back(attr_rest);
	}
}

/* When branching on a condition, inputData is divided into two parts. */
// NOTE(review): extraction damage fused two functions here. The body of
// splitInput and the header of the following function (presumably
// printinputData, judging by the commented-out calls elsewhere in the file)
// are missing; restore them from the original source.
void splitInput(vector > &inputData, int fitIndex, string cond, vector > &LinputData, vector > &RinputData) {
	for (int i = 0; i > &inputData) {
	// Surviving print loop: walks column by column (outer index i), printing
	// that column's value for every row j, tab-separated.
	for (int i = 0; i < ATTR_NUM + 2; ++i) {
		for (int j = 0; j < inputData.size(); ++j) {
			cout << inputData[j][i] << "\t";
		}
	}cout << endl;   // a single newline is emitted after all columns
}
/*
 * Print the statistic tables in stati to cout.
 *
 * For every table, each key is printed on its own line, followed by
 * tab-separated (class label, count) pairs; a blank line ends each table.
 *
 * The outer loop header was lost from the published text (everything
 * between "i<" and the "->" of "itr->first") and is restored here.
 * NOTE(review): the lost span may have contained more than the loop
 * header -- confirm against the original source.
 */
void printStati(VEC_STATI &stati) {
	for (size_t i = 0; i < stati.size(); ++i) {
		const MAP_ATTR_REST &attr_rest = stati[i];
		for (MAP_ATTR_REST::const_iterator itr = attr_rest.begin(); itr != attr_rest.end(); ++itr) {
			cout << itr->first;
			const MAP_REST_COUNT &counts = itr->second;
			for (MAP_REST_COUNT::const_iterator iter = counts.begin(); iter != counts.end(); ++iter) {
				cout << "\t" << iter->first << "\t" << iter->second;
			}
			cout << endl;
		}
		cout << endl;
	}
}

// Split the records in inputData at root: gather per-attribute statistics,
// create a left and a right child, give each child the majority class on its
// side as its decision, and divide inputData between them with splitInput.
// NOTE(review): large parts of this function were lost when the page was
// extracted (every span between a '<' and the next '>'): the template
// arguments, the loops that evaluate candidate conditions, the code that
// selects fitIndex / fitCond, the leaf-count bookkeeping loop and the
// recursive calls. Restore them from the original source before building.
void split(node *root, vector > &inputData, vector > classes) {
	//root->printInfo();
	root->record_number = inputData.size();
	VEC_STATI stati;
	statistic(inputData, stati);
	//printStati(stati);
	//for(int i=0;i > fitleftclasses;// class labels and counts for the left subtree
	vector > fitrightclasses;// class labels and counts for the right subtree
	int fitleftnumber;// number of records in the left subtree
	int fitrightnumber;
	for (int i = 0; ifirst;     // the condition being evaluated, i.e. the condition for reaching the left child; attribute
											   //cout<<"cond 為"< > leftclasses(classes);     // classes on the left child node and their counts
			vector > rightclasses(classes);    // classes on the right child node and their counts
			int leftnumber = 0;       // number of records on the left child node
			int rightnumber = 0;      // number of records on the right child node
			for (int j = 0; jsecond).find(rest);//
				if (iter2 == (itr->second).end()) {      // not found: all records of this class go to the right subtree
					leftclasses[j].second = 0;
					rightnumber += rightclasses[j].second;
				}
				else {       // found: the right side gets the overall count minus the left side's count
					leftclasses[j].second = iter2->second;
					leftnumber += leftclasses[j].second;
					rightclasses[j].second -= (iter2->second);
					rightnumber += rightclasses[j].second;
				}
			}
			// NOTE(review): in this damaged text the block comment opened on the
			// next line is not closed until the "recursively split" comment further
			// down, so everything in between is lexically commented out.
			/**if(leftnumber==0 || rightnumber==0){
			cout<<"左右有一邊為空"<cond<size)++;
		travel = travel->parent;
	}

	node *LChild = new node(root);        // create the left and right children
	node *RChild = new node(root);
	root->leftchild = LChild;
	root->rightchild = RChild;
	int maxLcount = 0;
	int maxRcount = 0;
	string Ldicision, Rdicision;
	for (int i = 0; imaxLcount) {
			maxLcount = fitleftclasses[i].second;
			Ldicision = fitleftclasses[i].first;
		}
		if (fitrightclasses[i].second>maxRcount) {
			maxRcount = fitrightclasses[i].second;
			Rdicision = fitrightclasses[i].first;
		}
	}
	LChild->decision = Ldicision;
	RChild->decision = Rdicision;
	//LChild->precision = 1.0*maxLcount / fitleftnumber;
	//RChild->precision = 1.0*maxRcount / fitrightnumber;

	/* Recursively split the left and right children */
	vector > LinputData, RinputData;
	splitInput(inputData, fitIndex, fitCond, LinputData, RinputData);
	// NOTE(review): extraction damage. The end of split() and the header of
	// the next function are fused into the comment line below; judging by the
	// recursive calls at the bottom and the call in main(), the lost header is
	// pruneprecision(node *root, <rows> &testinputData). The loop that locates
	// the attribute index and the call that fills LinputData / RinputData are
	// also missing. Restore them from the original source.
	//
	// What survives: for both children of root, count how many rows routed to
	// that child carry the child's decision as their label (column
	// ATTR_NUM + 1), store the ratio in the child's precision, then recurse
	// into every child that has a left child of its own.
	//cout<<"左邊inputData行數:"< > &testinputData) {
	int i=0;
	int fitIndex;   // NOTE(review): uninitialised; the code that assigned it is in the lost span
	total_record_number = testinputData.size();
	// NOTE(review): these two allocations are never used -- both pointers are
	// overwritten a few lines below, so each call leaks two nodes.
	node *LChild= new node(root);
	node *RChild= new node(root);
	vector > LinputData, RinputData;
	LChild =root->leftchild;
	RChild = root->rightchild;
	if (root->leftchild == NULL)
		return;
	string cond = root->cond;// the branch condition is a string "attribute=value"; the following lines take it apart
	string::size_type pos = cond.find("=");
	string pre = cond.substr(0, pos);// the substring before '=' (positions 0..pos) goes into pre
	string post = cond.substr(pos + 1);// the branch value at this node
	for(int index=0;indexrecord_number = LinputData.size();
	RChild->record_number = RinputData.size();
	//printinputData(LinputData);
	//printinputData(RinputData);
	/* Compute the accuracy */
	for (int j = 0; j < LinputData.size(); ++j) {
		string rest = LinputData[j][ATTR_NUM + 1];// label of this row on the left side
		if (rest == LChild->decision)
			i++;
	}
	// A child that received no rows gets precision 0 rather than dividing by zero.
	if (LChild->record_number == 0)
		LChild->precision = 0;
	else
		LChild->precision=1.0*i/LChild->record_number;
	i = 0;   // reuse the hit counter for the right side
	for (int j = 0; j < RinputData.size(); ++j) {
		string rest = RinputData[j][ATTR_NUM + 1];// label of this row on the right side
		if (rest == RChild->decision)
			i++;
	}
	if (RChild->record_number == 0)
		RChild->precision=0;
	else
		RChild->precision = 1.0*i/RChild->record_number;
	if(LChild->leftchild!=NULL)
		pruneprecision(LChild,LinputData);

	if(RChild->leftchild!=NULL)
		pruneprecision(RChild, RinputData);
}
/* Error cost of a subtree: the sum over its leaves of
   (1 - precision) * record_number / total_record_number. */
double calR2(node *root) {
	// An internal node contributes only through its two subtrees.
	if (root->leftchild != NULL) {
		const double leftCost = calR2(root->leftchild);
		const double rightCost = calR2(root->rightchild);
		return leftCost + rightCost;
	}
	// A node without a left child is a leaf (leaves have neither child).
	const double errorRate = 1 - root->precision;
	return errorRate * root->record_number / total_record_number;
}
/*
 * Traverse the tree level by level and number the nodes 1, 2, 3, ... in
 * visiting order (stored in node::index).
 *
 * The queue's element type was lost from the published text and is restored
 * here. A NULL root and a missing right child are now tolerated; previously
 * either one led to a NULL dereference.
 */
void index(node *root) {
	if (root == NULL)
		return;
	int next = 1;
	queue<node*> que;
	que.push(root);
	while (!que.empty()) {
		node* n = que.front();
		que.pop();
		n->index = next++;
		// As elsewhere in the file, a node without a left child is a leaf.
		if (n->leftchild != NULL) {
			que.push(n->leftchild);
			if (n->rightchild != NULL)
				que.push(n->rightchild);
		}
	}
}



/*層次遍歷樹，給節點標上序號。同時計算alpha*/
void calalpha(node *root, priority_queue, MyCompare> &pq) {
	int i = 1;
	queue que;
	que.push(root);
	while (!que.empty()) {
		node* n = que.front();
		que.pop();
		n->index = i++;
		if (n->leftchild != NULL) {
			que.push(n->leftchild);
			que.push(n->rightchild);
			//計算表面誤差率的增量
			double r1 = (1 - n->precision)*n->record_number / total_record_number;      //節點的誤差代價
			double r2 = calR2(n);
			n->alpha = (r1 - r2) / (n->size - 1);
			pq.push(MyTriple(n->alpha, n->size, n->index));
		}
	}
}

/*剪枝*/
void prune(node *root, priority_queue, MyCompare> &pq) {
	MyTriple triple = pq.top();
	int i = triple.third;
	queue que;
	que.push(root);
	while (!que.empty()) {
		node* n = que.front();
		que.pop();
		if (n->index == i) {
			cout << "將要剪掉" << i << "的左右子樹" << endl;
			n->leftchild = NULL;
			n->rightchild = NULL;
			int s = n->size - 1;
			node *trav = n;
			while (trav != NULL) {
				trav->size -= s;
				trav = trav->parent;
			}
			break;
		}
		else if (n->leftchild != NULL) {
			que.push(n->leftchild);
			que.push(n->rightchild);
		}
	}
}

void test(string filename, node *root,int labels) {
	ifstream ifs(filename.c_str());
	if (!ifs) {
		cerr << "open inputfile failed!" << endl;
		return;
	}
	string line;
	getline(ifs, line);
	string item;
	istringstream strstm(line);     //跳過第一行
	map independent;       //自變數，即分類的依據
	while (getline(ifs, line)) {
		istringstream strstm(line);
		//strstm.str(line);
		strstm >> item;
		cout << item << "\t";
		for (int i = 0; i> item;
			independent[X[i]] = item;
		}
		node *trav = root;
		while (trav != NULL) {
			if (trav->leftchild == NULL) {
				if (labels >0) {
					cout << (trav->decision) << "\t置信度:" << (trav->precision) << endl;
					break;
				}
				else
					cout << (trav->decision) << endl;
			}
			string cond = trav->cond;//分支條件是字串：屬性=屬性下的分類，一下是對字串的操作
			string::size_type pos = cond.find("=");
			string pre = cond.substr(0, pos);//將字串前0-pos的位置的子字串賦予pre
			string post = cond.substr(pos + 1);
			if (independent[pre] == post)
				trav = trav->leftchild;
			else
				trav = trav->rightchild;
		}
	}
	ifs.close();
}

// Program entry point: read the training file, gather statistics, report the
// tree size before pruning, classify the test file, compute the per-node
// precision on the test data and the alpha values, then classify again with
// precision output enabled.
// NOTE(review): parts of this function were lost when the page was extracted
// (every span between a '<' and the next '>'): the creation of root, the call
// that builds the tree, the template arguments of pq and the code between
// calalpha and the second report. In this damaged text a block comment also
// swallows the second report and the second test() call. Restore all of this
// from the original source before building.
int main() {
	string inputFile = "watermelon.txt";
	readInput(inputFile);
	VEC_STATI stati,teststati;        // the initial statistics
	statistic(inputData, stati);
	//  for(int i=0;iprintTree();
	cout << "剪枝前使用該決策樹最多進行" << root->size - 1 << "次條件判斷" << endl;
	string testFile = "testwatermelon.txt";
	readtestInput(testFile);
	test(testFile, root,0);
	/* Perform the pruning */
	pruneprecision(root,testinputData);
	//root->printTree();
	priority_queue, MyCompare> pq;
	calalpha(root,pq);
	/*// check whether the node with the smallest apparent-error increase is the one that gets pruned
	while(!pq.empty()){
	MyTriple triple=pq.top();
	pq.pop();
	cout<size - 1 << "次條件判斷" << endl;
	test(testFile, root,1);
	/*priority_queue pq;
	calalpha(root, pq);
	root->printTree();
	prune(root, pq);
	cout << "剪枝後使用該決策樹最多進行" << root->size - 1 << "次條件判斷" << endl;
	test(testFile, root);*/
	// Runs the shell command "pause"; presumably meant to keep a Windows
	// console window open -- on other systems the command does not exist.
	system("pause");
	return 0;
}

機器學習：決策樹cart演算法在分類與迴歸的應用（上）

#include #include #include #include #include #include #include #include #include #include #include using namespace std; //置信水平取0.95時的卡方表 const double CHI

機器學習：決策樹過擬合與剪枝，決策樹程式碼實現（三）

文章目錄楔子變數方法資料預處理剪枝獲取待剪集：針對ID3，C4.5的剪枝損失函式的設計基於該損失函式的演算法描述基於該損失函式的程式碼實

機器學習：決策樹及ID3,C4.5,CART演算法描述

文章目錄概念理解熵: 條件熵: 資訊增益，互資訊: 資訊增益比基尼指數 ID3演算法描述 C4.5演算法描述 CART (Classification and Regression Tree

機器學習：決策樹（基尼系數）

try matplot 代碼實現 sci bubuko div tro 兩種 () 一、基礎理解　1）公式 k：數據集中樣本類型數量； Pi：第 i 類樣本的數量占總樣本數量的比例　2）實例計算基尼系數 3 種情況計算基尼系數：基尼系數的性質與信息熵

優達機器學習：決策樹練習題

12 練習：決策樹準確性這裡優達的執行環境有個坑，就是他時而準確時而錯誤，所以測試的時候就一會兒是對的，一會兒是錯的，同樣的一個程式碼，感覺變數會混淆似的 import sys from class_vis import prettyPicture f

機器學習：決策樹（Decision Tree）

1. 理論概述：決策樹的內部節點表示一個特徵或屬性，葉子節點表示一個類別。輸入一個新樣本，從根節點開始按照節點所示的特徵劃分，直到劃分到葉子節點，該葉子節點即為類別。關於熵的基礎知識熵：

機器學習：決策樹（Decision Tree）

本部落格參考鄒博機器學習課件以及李航的《統計學習方法》，僅用於督促自己學習使用，如有錯誤，歡迎大家提出更正決策樹（decision tree）是一種基本的分類與迴歸方法。在分類問題中，它可以認為是if-then規則的集合，也可以認為是定義在特徵空間與

機器學習：決策樹

決策樹是機器學習中非常基礎的演算法，也是我研究生生涯學習到的第一個有監督模型，其中最基礎的ID3是1986年被發表出來的，一經發表，之後出現了眾多決策樹演算法，不過最常見的還是C4.5和cart樹。在我的研究中，用不到決策樹，在天池或者Kaggle也很少用到單個決策樹，這種競賽一般用的整合演算法較多，畢竟是2

Python學習：基本數據類型與變量（中）與基礎之條件及循環（上）

sets 但是 while循環 spl view put 算數運算 sse 邏輯運算一.數據類型和變量 1.可變與不可變數據類型　　可變數據類型:在id不變的情況下，數據類型內部的元素可以改變　　列表　　字典　　不可變數據類型：value改變，id也跟著改變

【資料分析 R語言實戰】學習筆記第六章引數估計與R實現（上）

6.1點估計及R實現 6.1.1矩估計 R中的解方程函式: 函式及所在包：功能 uniroot()@stats：求解一元（非線性)方程 multiroot()@rootSolve：給定n個(非線性)方程，求解n個根 uniroot.all()@rootSolve：

【機器學習】決策樹（基於ID3,C4.5,CART分類迴歸樹演算法）—— python3 實現方案

內含3種演算法的核心部分. 沒有找到很好的測試資料. 但就理清演算法思路來說問題不大剪枝演算法目前只實現了CART迴歸樹的後剪枝. import numpy as np from collections import Counter from sklearn imp

【機器學習】決策樹（下)CART演算法分類樹、迴歸樹

CART同樣由特徵選擇、樹的生成、剪枝組成。既可以用於迴歸，又可以用於分類。 CART是在給定輸入隨機變數X條件下輸出隨機變數Y的條件概率分佈的學習方法。 CART假設決策樹是二叉樹，內部節點特徵的取值為“是“和“否“，左分支是取值為“是“的分支，右分支是取值為“否“的分支。這樣的決策樹

【機器學習筆記27】CART演算法-迴歸樹和分類樹

基本概念分類和迴歸樹(classification and regression tree, CART) 是應用廣泛的決策樹學習方法，由特徵選擇、樹的生成和剪枝組成，既可以用做分類也可以用作迴歸。迴歸樹迴歸樹的定義假設X和Y分別作為輸入和輸出變數，那麼

[Java][機器學習]用決策樹分類演算法對Iris花資料集進行處理

Iris Data Set是很經典的一個數據集，在很多地方都能看到，一般用於教學分類演算法。這個資料集在UCI Machine Learning Repository裡可以找到（還是下載量排第一的資料喲）。這個資料集裡面，每個資料都包含4個值(sepal len

機器學習——十大資料探勘之一的決策樹CART演算法

本文始發於個人公眾號：TechFlow，原創不易，求個關注今天是**機器學習專題**的第23篇文章，我們今天分享的內容是十大資料探勘演算法之一的CART演算法。 CART演算法全稱是**Classification and regression tree**，也就是分類迴歸樹的意思。和之前介紹

機器學習_8.決策樹演算法

1.ID3演算法預備知識 1.資訊熵： 2.資訊增益演算法內容引入了資訊理論中的互資訊（資訊增益）作為選擇判別因素的度量，即：以資訊增益的下降速度作為選取分類屬性的標準，所選的測試屬性是從根節點到當前節點的路徑上從沒有

【機器學習】決策樹演算法（二）— 程式碼實現

#coding=utf8 ‘’’ Created on 2018年11月4日 @author: xiaofengyang 決策樹演算法：ID3演算法 ‘’’ from sklearn.feature_extraction import DictVectorize

機器學習方法(四)：決策樹Decision Tree原理與實現技巧

歡迎轉載，轉載請註明：本文出自Bin的專欄blog.csdn.net/xbinworld。技術交流QQ群：433250724，歡迎對演算法、技術、應用感興趣的同學加入。前面三篇寫了線性迴歸，lasso，和LARS的一些內容，這篇寫一下決策樹這個經典的分

機器學習之決策樹機器學習之K-近鄰演算法

　　都說萬事開頭難，可一旦開頭，就是全新的狀態，就有可能收穫自己未曾預料到的成果。從2018.12.28開始，決定跟隨《機器學習實戰》的腳步開始其征程，記錄是為了更好的監督、理解和推進，學習過程中用到的資料集和程式碼都將上傳到github 　　機器學習系列部落格：（1）機器學習之K-近鄰演算法

機器學習：樸素貝葉斯分類器，決策函式向量化處理，mask使用技巧

文章目錄前面實現的樸素貝葉斯分類器，決策函式是非向量化的：藉助於numpy向量化處理，相當於平行計算，注意mask使用技巧，用途較廣：前面實現的樸素貝葉斯分類器，決策函式是非向量化的：前面提到過大資料處理，儘量避免個人的遍歷等一些函式

機器學習：決策樹cart演算法在分類與迴歸的應用（上）

相關推薦