1. 程式人生 > >C++實現哈夫曼編碼--構建哈夫曼編碼樹

C++實現哈夫曼編碼--構建哈夫曼編碼樹

     哈夫曼編碼有動態和靜態之分。靜態哈夫曼編碼需要先統計和計算每個欄位的權重(比如文字中'A'字母出現的次數),效率會很低,特別是壓縮大檔案時,基本是不現實的。不過,靜態哈夫曼編碼是基礎,有助於理解壓縮和解壓的思想。

    本文所使用的演算法和構建思路,跟我們通常的資料結構和演算法書本里面介紹的思路是一致的。這個思路也是最原始和直觀的,比較容易理解,本質上是一個經典的貪心演算法例子:每次從各個子樹中(原始的單個元素我們也看成是一棵只有root節點的樹),選擇2個權重最大或者權重最小的,最大或者最小的為左子樹,次大或者次小的為右子樹,左右子樹合併成一棵樹。當然,合併過程中出現的子樹,它的權重是左右子樹的權重之和。如此迴圈往復選取,直到最後只剩一棵樹,即生成哈夫曼編碼樹。有了編碼樹,每個葉子節點的編碼也就明顯了。壓縮過程就是用編碼位域替換文字欄位;解壓縮就是掃描編碼樹和檔案的位域,如果是0走到左子樹,如果是1走到右子樹,到了葉子節點,就解壓出一個文字欄位。如此迴圈,即可完整地解壓出全部內容。

      為了完整理解該思路,我這邊的實現並沒有考慮效率等,完全模擬編解碼的最原始的思路來實現,這樣非常便於理解。實現思路是把欄位的出現次數(權重)按從小到大排序,每次選擇2個最小的構成二叉樹,如此迴圈直到只剩最後一棵二叉樹,即編碼樹。每個欄位都是一個位元組,為0-255的數字。

      建立二叉樹後,這裡需要遍歷2遍編碼樹,才可以獲取到每個欄位的編碼,關於這點,如果有效率更高的遍歷方法,多謝告知。如下是實現程式碼。

#include "HanfuMan.h"

// Number of random bytes generated as sample input by TestHanfuMan().
#define TEST_DATA_LEN 16

// Forward declaration: inserts one (data, dataTimes) record into pDataInfo, which currently holds len records; defined further down in this file.
void InsertSort(PDATA_INFO pDataInfo,int len,unsigned char data,unsigned short dataTimes);

/**
 * Self-test for the Huffman tree builder.
 *
 * Fills a TEST_DATA_LEN-byte buffer with random values, counts how often each
 * byte value occurs, sorts the (value, count) pairs by ascending count, builds
 * the coding tree from them, prints every leaf's code and releases everything.
 *
 * @return TRUE when the tree was built and enumerated, FALSE otherwise.
 */
BOOL TestHanfuMan()
{
	// All locals that are live at the RET label are declared up front so that
	// "goto RET" never jumps over an initialization (ill-formed in C++).
	BOOL bRet = FALSE;
	PDATA_INFO pDataInfo = NULL;
	PHANFUMAN_TREE tree = NULL;
	int count = 0; // number of distinct byte values that occur
	int len = 0;   // number of records inserted into pDataInfo so far
	unsigned short times[256] = {0}; // occurrence count per byte value

	unsigned char *p = new unsigned char[TEST_DATA_LEN];
	if(!p)
	{
		return FALSE;
	}
	memset(p,0,TEST_DATA_LEN);

	srand((unsigned int)time(NULL));
	for(int i=0;i<TEST_DATA_LEN;i++)
	{
		p[i] = rand()%256; // random sample content
	}

	// Count the occurrences of each value in 0-255.
	for(int i=0;i<TEST_DATA_LEN;i++)
	{
		times[p[i]]++;
	}

	// Count how many distinct values actually occur.
	for(int i=0;i<256;i++)
	{
		if(times[i])
		{
			count++;
		}
	}

	pDataInfo = new DATA_INFO[count];
	if(!pDataInfo)
	{
		goto RET;
	}
	memset(pDataInfo,0,count*sizeof(DATA_INFO));

	for(int i=0;i<256;i++)
	{
		if(times[i])
		{
			// Insertion sort: keep the records ordered by ascending count.
			InsertSort(pDataInfo,len,(unsigned char)i,times[i]);
			len++;
		}
	}

	tree = CreateHanfuManTree(pDataInfo,len);
	if(!tree)
	{
		goto RET;
	}
	EnumHanfuManCode(tree);
	DestroyTree(tree);

	// Previously bRet was never updated, so the function always returned FALSE.
	bRet = TRUE;

RET:
	// delete[] on a null pointer is a no-op, so no guards are needed.
	delete [] pDataInfo;
	delete [] p;

	return bRet;
}

/**
 * Inserts one (data, dataTimes) record into an array that is kept sorted by
 * ascending times.
 *
 * @param pDataInfo  array holding len sorted records; must have room for one more
 * @param len        number of records currently stored
 * @param data       byte value of the new record
 * @param dataTimes  occurrence count of the new record
 *
 * A record whose count equals existing ones is placed after them.
 */
void InsertSort(PDATA_INFO pDataInfo,int len,unsigned char data,unsigned short dataTimes)
{
	// Walk past every record whose count does not exceed the new one.
	int pos = 0;
	while(pos < len && pDataInfo[pos].times <= dataTimes)
	{
		pos++;
	}

	// Open a gap at pos: move the tail one slot to the right, last record first.
	for(int slot = len; slot > pos; slot--)
	{
		pDataInfo[slot] = pDataInfo[slot-1];
	}

	// Store the new record in the gap.
	pDataInfo[pos].data = data;
	pDataInfo[pos].times = dataTimes;
}

/**
 * Inserts a subtree root into an array of roots kept sorted by ascending weight.
 *
 * @param pSubTree      array holding subTreeCount sorted roots; must have room for one more
 * @param subTreeCount  number of roots currently stored
 * @param insertTree    root to insert
 *
 * A root whose weight equals existing ones is placed after them.
 */
void InsertSortTree(PHANFUMAN_TREE *pSubTree,int subTreeCount,PHANFUMAN_TREE insertTree)
{
	// Walk past every root that is not heavier than the new one.
	int pos = 0;
	while(pos < subTreeCount && pSubTree[pos]->weight <= insertTree->weight)
	{
		pos++;
	}

	// Shift the heavier roots one slot to the right, last root first.
	for(int slot = subTreeCount; slot > pos; slot--)
	{
		pSubTree[slot] = pSubTree[slot-1];
	}

	pSubTree[pos] = insertTree;
}

/**
 * Replaces the first two roots of the sorted root array with their merged tree.
 *
 * @param pSubTree      sorted array of subtree roots
 * @param subTreeCount  number of roots before the call
 * @param mergeTree     tree that was built from pSubTree[0] and pSubTree[1]
 *
 * The caller is responsible for decrementing its own root counter afterwards.
 */
void RefreshSubTrees(PHANFUMAN_TREE *pSubTree,int subTreeCount,PHANFUMAN_TREE mergeTree)
{
	const int remaining = subTreeCount-2;

	// Drop the two leading roots by sliding everything else two slots left.
	for(int dst=0;dst<remaining;dst++)
	{
		pSubTree[dst] = pSubTree[dst+2];
	}

	// Put the merged tree back at its weight-ordered position.
	InsertSortTree(pSubTree,remaining,mergeTree);
}

/**
 * Creates a new internal node whose children are the two given subtrees.
 *
 * @param pLeftSubTree   left child; must not be NULL
 * @param pRightSubTree  right child; may be NULL, in which case the new node
 *                       has only a left child
 * @return the new root (weight = sum of the children's weights, data = 0,
 *         parent = NULL), or NULL if the node could not be allocated
 *
 * Both children get their parent pointer set to the new node.
 */
PHANFUMAN_TREE MergeTree(PHANFUMAN_TREE pLeftSubTree,PHANFUMAN_TREE pRightSubTree)
{
	PHANFUMAN_TREE joined = new HANFUMAN_TREE;
	if(NULL == joined)
	{
		return NULL;
	}

	// Internal nodes carry no payload and start out as a root.
	joined->data = 0;
	joined->parent = NULL;
	joined->left = pLeftSubTree;
	joined->right = pRightSubTree;

	// The left child always exists.
	joined->weight = pLeftSubTree->weight;
	pLeftSubTree->parent = joined;

	// The right child is optional.
	if(NULL != pRightSubTree)
	{
		joined->weight += pRightSubTree->weight;
		pRightSubTree->parent = joined;
	}

	return joined;
}

/**
 * Allocates a leaf node for one (data, times) record.
 *
 * @param pDataInfo  record to copy the byte value and occurrence count from
 * @return a detached node (parent, left and right are NULL), or NULL if the
 *         node could not be allocated
 */
PHANFUMAN_TREE CreateLeaf(PDATA_INFO pDataInfo)
{
	PHANFUMAN_TREE leaf = new HANFUMAN_TREE;
	if(NULL == leaf)
	{
		return NULL;
	}

	// A fresh leaf has no relatives yet.
	leaf->parent = NULL;
	leaf->left = NULL;
	leaf->right = NULL;

	// Payload comes straight from the record.
	leaf->weight = pDataInfo->times;
	leaf->data = pDataInfo->data;

	return leaf;
}

/**
 * Builds the Huffman coding tree from a list of (data, times) records.
 *
 * @param pDataInfo  records to encode; the selection logic below relies on
 *                   them being ordered by ascending times
 * @param len        number of records
 * @return root of the coding tree, or NULL when len <= 0. The nodes are
 *         heap-allocated; DestroyTree() releases them.
 *
 * Each round picks the two lightest candidates among the not-yet-consumed
 * records and the already built subtrees (subtrees win ties), merges them
 * into a new subtree and re-inserts it in weight order. When all records are
 * consumed, the remaining subtrees are merged pairwise into a single tree.
 *
 * If a node allocation helper returns NULL the process is terminated with
 * exit(0).
 */
PHANFUMAN_TREE CreateHanfuManTree(PDATA_INFO pDataInfo,int len)
{
	if(len<=0)
	{
		return NULL;
	}

	int dataIndex = 0;
	// At most (len+1)/2 subtrees can exist at once; this array stores the
	// root pointers of all subtrees that are alive during construction.
	PHANFUMAN_TREE *pSubTree = (PHANFUMAN_TREE*) new PHANFUMAN_TREE[(len+1)/2];
	PHANFUMAN_TREE root = NULL;
	int subTreeCount = 0; // number of subtrees currently stored
	HANFUMAN_SELECT_HELPER  selectHelper;

	memset(pSubTree,0,sizeof(PHANFUMAN_TREE)*((len+1)/2));

	while(dataIndex<len)
	{
		// Compare the remaining records with the subtrees and select the two
		// lightest candidates; on equal weight a subtree is preferred.
		// Both sequences are sorted ascending, so only their heads matter.
		if(subTreeCount>=2)
		{
			selectHelper.firstMinIndex = 0;
			selectHelper.secondMinIndex = 1;	
		}
		else
		{
			if(subTreeCount>=1)
			{
				selectHelper.firstMinIndex = 0;
			}
		}

		if(-1 == selectHelper.firstMinIndex)
		{
			// No subtree yet: take the next one or two records.
			selectHelper.firstMinIndex = dataIndex;
			selectHelper.firstMinType = INDEX_TYPE_INFO;
			if(++dataIndex<len)
			{
				selectHelper.secondMinIndex = dataIndex++;
				selectHelper.secondMinType = INDEX_TYPE_INFO;
			}
		}
		else
		{
			if(pDataInfo[dataIndex].times < (pSubTree[selectHelper.firstMinIndex])->weight)
			{
				// The record is lighter than the lightest subtree: the record
				// becomes first, the lightest subtree becomes the second candidate.
				selectHelper.secondMinIndex = selectHelper.firstMinIndex;

				selectHelper.firstMinIndex = dataIndex;
				selectHelper.firstMinType = INDEX_TYPE_INFO;

				// The following record may still beat that subtree.
				if( (++dataIndex<len) && ( pDataInfo[dataIndex].times < (pSubTree[selectHelper.secondMinIndex])->weight  ) )
				{
					selectHelper.secondMinIndex = dataIndex++;
					selectHelper.secondMinType = INDEX_TYPE_INFO;
				}
			}
			else
			{
				// The lightest subtree stays first; the record is second when
				// there is no second subtree or the record is lighter than it.
				if( (-1==selectHelper.secondMinIndex) || (pDataInfo[dataIndex].times < (pSubTree[selectHelper.secondMinIndex])->weight))
				{
					selectHelper.secondMinIndex = dataIndex++;
					selectHelper.secondMinType = INDEX_TYPE_INFO;
				}
			}
		}// the two lightest candidates are now selected

		if(INDEX_TYPE_TREE == selectHelper.firstMinType && INDEX_TYPE_TREE == selectHelper.secondMinType)
		{
			// Merge the two lightest subtrees.
			PHANFUMAN_TREE mergeTree = MergeTree(pSubTree[0],pSubTree[1]);
			if(!mergeTree)
			{
				exit(0);
			}
			RefreshSubTrees(pSubTree,subTreeCount,mergeTree);

			subTreeCount--;
		}
		if(INDEX_TYPE_TREE == selectHelper.firstMinType && INDEX_TYPE_INFO == selectHelper.secondMinType)
		{
			// Lightest subtree on the left, new leaf on the right.
			PHANFUMAN_TREE newLeaf = CreateLeaf(&pDataInfo[selectHelper.secondMinIndex]);
			if(!newLeaf)
			{
				exit(0);
			}
			PHANFUMAN_TREE mergeTree = MergeTree(pSubTree[0],newLeaf);
			if(!mergeTree)
			{
				exit(0);
			}
			// Remove the consumed subtree (slot 0) from the array.
			for(int i=1;i<subTreeCount;i++)
			{
				pSubTree[i-1] = pSubTree[i];
			}

			InsertSortTree(pSubTree,subTreeCount-1,mergeTree);// one removed, one inserted: count unchanged
		}
		if(INDEX_TYPE_INFO == selectHelper.firstMinType && INDEX_TYPE_INFO == selectHelper.secondMinType)
		{
			// Two records form a brand-new subtree.
			PHANFUMAN_TREE leftLeaf = CreateLeaf(&pDataInfo[selectHelper.firstMinIndex]);
			if(!leftLeaf)
			{
				exit(0);
			}
			PHANFUMAN_TREE rightLeaf = CreateLeaf(&pDataInfo[selectHelper.secondMinIndex]);
			// Fixed: this check used to test leftLeaf a second time, so a failed
			// right-leaf allocation went unnoticed.
			if(!rightLeaf)
			{
				exit(0);
			}
			PHANFUMAN_TREE mergeTree = MergeTree(leftLeaf,rightLeaf);
			if(!mergeTree)
			{
				exit(0);
			}
			InsertSortTree(pSubTree,subTreeCount,mergeTree);
			subTreeCount++; // a new subtree was added
		}
		if(INDEX_TYPE_INFO == selectHelper.firstMinType && INDEX_TYPE_TREE == selectHelper.secondMinType)
		{
			if(-1 == selectHelper.secondMinIndex)
			{
				// Only one record and no subtree: build a root with a single left leaf.
				PHANFUMAN_TREE leftLeaf = CreateLeaf(&pDataInfo[selectHelper.firstMinIndex]);
				if(!leftLeaf)
				{
					exit(0);
				}
				PHANFUMAN_TREE mergeTree = MergeTree(leftLeaf,NULL);
				if(!mergeTree)
				{
					exit(0);
				}
				InsertSortTree(pSubTree,subTreeCount,mergeTree);
				subTreeCount++; 
			}
			else
			{
				// New leaf on the left, lightest subtree on the right.
				PHANFUMAN_TREE leftLeaf = CreateLeaf(&pDataInfo[selectHelper.firstMinIndex]);
				if(!leftLeaf)
				{
					exit(0);
				}
				PHANFUMAN_TREE mergeTree = MergeTree(leftLeaf,pSubTree[selectHelper.secondMinIndex]);
				if(!mergeTree)
				{
					exit(0);
				}
				// Remove the consumed subtree (slot 0) from the array.
				for(int i=1;i<subTreeCount;i++)
				{
					pSubTree[i-1] = pSubTree[i];
				}

				InsertSortTree(pSubTree,subTreeCount-1,mergeTree);
			}
		}

		// Reset the selection state for the next round.
		selectHelper.Init();
	}

	// All records consumed: merge the remaining subtrees pairwise.
	while(subTreeCount>1)
	{
		// Merge the two lightest subtrees.
		PHANFUMAN_TREE mergeTree = MergeTree(pSubTree[0],pSubTree[1]);
		if(!mergeTree)
		{
			exit(0);
		}
		RefreshSubTrees(pSubTree,subTreeCount,mergeTree);

		subTreeCount--;
	}

	// Exactly one subtree is left; it is the coding tree.
	PHANFUMAN_TREE tree = pSubTree[0];
	delete [] pSubTree;
	
	return tree;
}

/**
 * Releases a tree: both subtrees first, then the node itself.
 *
 * @param tree  root of the (sub)tree to delete; NULL is accepted and ignored
 */
void DestroyTree(PHANFUMAN_TREE tree)
{
	if(NULL != tree)
	{
		DestroyTree(tree->left);  // left subtree
		DestroyTree(tree->right); // right subtree
		delete tree;              // finally the node itself
	}
}

/**
 * Prints the code of a node by following parent pointers up to the root.
 *
 * The recursion reaches the root first, so the bits are printed root-to-node:
 * "0" for every step into a left child, "1" otherwise.
 *
 * @param tree     node whose code is printed; NULL or a root prints nothing
 * @param codeLen  incremented once per printed bit
 */
void PrintHanfuManCode(PHANFUMAN_TREE tree,int *codeLen)
{
	if(NULL == tree || NULL == tree->parent)
	{
		return;
	}

	PHANFUMAN_TREE up = tree->parent;

	// Emit the bits of all ancestors before this node's own bit.
	PrintHanfuManCode(up,codeLen);

	(*codeLen)++;
	if(tree == up->left)
	{
		printf("0");
	}
	else
	{
		printf("1");
	}
}


/**
 * Walks the coding tree (left subtree before right subtree) and prints, for
 * every leaf, its byte value, its code and the code length.
 *
 * The code of each leaf is obtained by a second walk from the leaf up to the
 * root (see PrintHanfuManCode).
 *
 * @param tree  root of the (sub)tree to enumerate; NULL is accepted and ignored
 */
void EnumHanfuManCode(PHANFUMAN_TREE tree)
{
	if(!tree)
	{
		return;
	}

	// Leaf node: print its value and code.
	if(!tree->left && !tree->right)
	{
		int codeLen = 0;
		// %02x zero-pads the value; the former %2x padded with a space and
		// produced output such as "0x 5".
		printf("data value = 0x%02x    HanfuMan Code = ",(unsigned int)tree->data);
		PrintHanfuManCode(tree,&codeLen);
		printf("   CodeLen = %d\r\n",codeLen);
		return;
	}

	// The callee ignores NULL, so the children need no separate checks.
	EnumHanfuManCode(tree->left);
	EnumHanfuManCode(tree->right);
}
      標頭檔案內容如下:
#ifndef _HANFUMAN_H_
#define _HANFUMAN_H_

// One node of the Huffman coding tree.
struct _t_HANFUMAN_TREE
{
	unsigned char data;    // encoded byte value (0-255); set to 0 for non-leaf nodes
	unsigned short weight; // weight of the node; here the number of occurrences of data

	struct _t_HANFUMAN_TREE* parent;
	struct _t_HANFUMAN_TREE* left;
	struct _t_HANFUMAN_TREE* right;
};
typedef struct _t_HANFUMAN_TREE HANFUMAN_TREE;
typedef struct _t_HANFUMAN_TREE* PHANFUMAN_TREE;


// Kind of candidate an index refers to during selection.
#define INDEX_TYPE_TREE 0x00 // index into the subtree array
#define INDEX_TYPE_INFO 0x01 // index into the data-record array

// Scratch state describing the two lightest candidates of one selection round.
struct _t_HANFUMAN_SELECT_HELPER
{
	int firstMinIndex;           // index of the lightest candidate, -1 when unset
	int secondMinIndex;          // index of the second lightest candidate, -1 when unset
	unsigned char firstMinType;  // INDEX_TYPE_* of firstMinIndex
	unsigned char secondMinType; // INDEX_TYPE_* of secondMinIndex

	// A new helper starts in the reset state.
	_t_HANFUMAN_SELECT_HELPER()
	{
		Init();
	}

	// Resets both slots: no index selected, type defaults to subtree.
	void Init()
	{
		firstMinType = INDEX_TYPE_TREE;
		secondMinType = INDEX_TYPE_TREE;
		firstMinIndex = -1;
		secondMinIndex = -1;
	}
};
typedef _t_HANFUMAN_SELECT_HELPER HANFUMAN_SELECT_HELPER;
typedef _t_HANFUMAN_SELECT_HELPER* PHANFUMAN_SELECT_HELPER;

// One input record: a byte value together with its occurrence count.
struct _t_DATA_INFO
{
	unsigned char data;   // byte value
	unsigned short times; // number of occurrences of data
};
typedef struct _t_DATA_INFO DATA_INFO;
typedef struct _t_DATA_INFO* PDATA_INFO;

// Self-test: builds a coding tree from random sample data and prints the codes. Returns a BOOL status.
BOOL TestHanfuMan();

// Builds the Huffman coding tree from len (data, times) records; returns NULL when len <= 0.
// NOTE(review): the implementation assumes pDataInfo is sorted by ascending times — confirm for new callers.
PHANFUMAN_TREE CreateHanfuManTree(PDATA_INFO pDataInfo,int len);
// Prints the byte value, code and code length of every leaf of tree; a NULL tree is ignored.
void EnumHanfuManCode(PHANFUMAN_TREE tree);
// Recursively deletes every node of tree; a NULL tree is ignored.
void DestroyTree(PHANFUMAN_TREE tree);

#endif
         測試例子如下:
#include <Windows.h>
#include <stdio.h>
#include "HanfuMan.h"


// Test driver: runs the Huffman self-test, then waits for a key press so the
// console output stays visible.
int main(int argc,char* argv[])
{
	// Command-line arguments are not used.
	(void)argc;
	(void)argv;

	TestHanfuMan(); // the status is not inspected

	getchar();
	return 0;
}