c++ 資料結構 *** 哈夫曼樹的應用——壓縮軟體

阿新 • • 發佈：2019-01-01

資料結構的作業，壓縮軟體用的，具體寫的過程中有哪些問題在程式裡說吧。

標頭檔案與常量部分：

利用char的8位，來儲存檔案裡的元素。每次取出檔案中的8位，並記錄這8位出現的次數，用來進行哈夫曼樹的建立。

#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<iostream>
#include<iterator>
#include<list>
#include<stack>
#include<string>

using namespace std;

// The input is consumed one 8-bit char at a time, so text, CJK text or any other format can be compressed alike.
const unsigned int N = 1u << 8;                    // 256 possible 8-bit values = maximum number of leaves
const unsigned int M = N + N;                      // a tree with n leaves has 2*n-1 nodes; the last slot (M-1) holds no tree node and serves select() as a sentinel
const unsigned long long MAX = (1ULL << 48) - 1;   // 0xffffffffffff: the weight given to the sentinel slot

三個類的宣告部分： Buffer類用來從檔案中讀取8位字元或者寫入8位字元。因為每次都只能從檔案中讀取或者寫入一個字元，所以利用Buffer類進行緩衝。 treeNode類是哈夫曼樹的結點類，其中儲存了樹中每個結點出現的頻率與其左右子結點與父節點。 HuffmanTree類是哈夫曼樹類，可以進行壓縮或解壓。

// Bit buffer used by HuffmanTree for file I/O: files are read and written a
// whole byte at a time, so single bits are staged here until a byte is full
// (writing) or exhausted (reading).
class Buffer {
public:
	unsigned int bits;  // number of bits of `ch` that are currently in use
	// Staged byte. Declared unsigned so that shifting and masking bytes with
	// the top bit set (>= 0x80) never operates on a negative value.
	unsigned char ch;

	// Start out empty so a Buffer is never used with indeterminate contents.
	Buffer() : bits(0), ch(0) {}
};

class HuffmanTree;  // forward declaration of the Huffman tree class

// One node of the Huffman tree. Every field is private; HuffmanTree is a
// friend and is the only code that reads or writes them.
class treeNode {
	friend class HuffmanTree;

	unsigned long long weight;  // how many times this node occurs in total
	unsigned int right;         // index of the right child in HuffmanTree::nodes
	unsigned int left;          // index of the left child in HuffmanTree::nodes
	unsigned int parents;       // index of the parent in HuffmanTree::nodes
};

class HuffmanTree {//huffman樹類
private:
	treeNode nodes[M];//儲存樹的每個結點資訊，最多M個
	unsigned int leaf[N];//儲存葉節點字元資訊，最多N個
	unsigned int index[N];//儲存葉節點的下標值
	char* leafCode[N];//儲存葉節點的編碼資訊
	FILE* input, *output;
	unsigned int num;//實際上葉結點個數
	unsigned long long size;//一共有多少個字元
	Buffer buf;//輸入輸出緩衝

	//輔助函式
	void write(unsigned int i);//向檔案中寫入一個bite
	void write(unsigned int num, unsigned int bits);//向檔案中寫入bits位的num數
	void writerest();//如果最後buf裡面還有字元沒有寫入，用這個函式一起寫入
	void read(unsigned int &i);//從檔案中讀取一位bite，用i輸出
	void read(unsigned int &num, unsigned int bits);//從檔案中讀取bits位的數，並用num輸出

	//Encode輔助函式
	void enSieve();//讀取檔案，並且統計檔案中字元數
	void select(unsigned int pos,unsigned int &t1,unsigned int &t2);//在0——pos間選擇兩個權重最小的，用t1、t2輸出
	void enSetTree();//對需要被壓縮的檔案中的字元建立huffman樹，並且完善每個結點的編碼資訊

	//Decode輔助函式
	void deSieve();//讀取需要被解壓檔案，並建造其哈夫曼樹

public:
	void Encode();//對檔案編碼
	void Decode();//對檔案譯碼
};

HuffmanTree類的實現：

讀取/寫入輔助函式：

主要是利用Buffer類中的bits來記錄ch中實際有效的位數。在write的時候，如果ch中實際位數已經足夠8位，那麼將ch寫入檔案，然後再將bits置0，ch置0；在read的時候，如果ch的實際位數已經等於0，那麼從檔案中fgetc()一個8位數給ch，同時bits置8。同時，有可能在寫入到最後的時候，bits實際上並不等於0，那麼用writerest在低位補0湊滿8位，將剩下的ch寫入。

//輔助函式塊
// Appends a single bit to the output file.
// The bit is shifted into buf.ch as the new lowest bit; once eight bits have
// been collected the byte is written to `output` and the buffer starts over.
void HuffmanTree::write(unsigned int i) {
	buf.ch = i + (buf.ch << 1);
	++buf.bits;
	if (buf.bits != 8)
		return;  // byte not complete yet

	fputc(buf.ch, output);
	buf.ch = 0;
	buf.bits = 0;
}
// Appends the low `bits` bits of `target` to the output file, most
// significant of those bits first.
void HuffmanTree::write(unsigned int target, unsigned int bits) {
	const unsigned int width = sizeof(unsigned int) * 8;
	for (unsigned int remaining = bits; remaining > 0; --remaining) {
		const unsigned int pos = remaining - 1;
		// Positions beyond the width of unsigned int carry no data and are
		// emitted as 0 (shifting by >= width would be undefined).
		const unsigned int bit = (pos < width) ? ((target >> pos) & 1u) : 0u;
		write(bit);
	}
}
// Flushes a partially filled buffer byte: pads it with 0 bits until write()
// sees eight bits and emits the byte. Does nothing when the buffer is empty.
void HuffmanTree::writerest() {
	const unsigned int pending = buf.bits;
	if (pending == 0)
		return;

	unsigned int padding = 8 - pending;
	while (padding-- > 0)
		write(0);
}
// Reads one bit from the input file into `i` (0 or 1).
// When the buffer is exhausted the next byte is fetched with fgetc(); its
// result is stored without an end-of-file check. Bits are handed out from the
// top bit of the byte downwards.
void HuffmanTree::read(unsigned int &i) {
	if (buf.bits == 0) {
		buf.ch = fgetc(input);
		buf.bits = 8;
	}
	i = (buf.ch & 0x80) ? 1 : 0;
	buf.ch = buf.ch << 1;  // move the next bit into the top position
	--buf.bits;
}
// Reads a `bits`-bit number from the input file into `target`; the first bit
// read becomes the most significant one.
void HuffmanTree::read(unsigned int &target, unsigned int bits) {
	target = 0;
	unsigned int remaining = bits;
	while (remaining > 0) {
		unsigned int bit;
		read(bit);
		target = bit + (target << 1);
		--remaining;
	}
}

Encode()函式及其輔助函式的實現部分：

昨晚在這部分出現了一個錯誤點，在enSetTree函式的實現部分，如下程式碼：

	for (int i = 0; i < N; ++i)
		leafCode[i] = NULL;

被我寫成了：

	for (int i = 0; i <= N; ++i)
		leafCode[i] = NULL;

然後就導致了FILE* input莫名其妙被置成了NULL，之後找到錯誤之後感慨了一下不要亂置NULL。。下標一定要看清。。。

//Encode輔助函式塊
void HuffmanTree::enSieve() {//讀取檔案，並且統計檔案中字元數
	char inName[1000], outName[1000];
	cout << "Input file name that you want to code:";
	cin >> inName;
	cout << "Input target file name:";
	cin >> outName;
	if ((input = fopen(inName, "rb")) == NULL) {
		cout << "Can not open file." << endl;
		system("pause");
		exit(1);
	}
	if (feof(input)) {
		cout << "Empty source file" << endl;
		system("pause");
		exit(1);
	}
	if ((output = fopen(outName, "wb")) == NULL) {
		cout << "Can not open file." << endl;;
		system("pause");
		exit(1);
	}

	//從檔案中讀取字元，並統計字元出現頻率
	rewind(input);
	unsigned int ch;
	size = 0;
	for (unsigned int i = 0; i < N; ++i) {
		leaf[i] = 0;
		index[i] = 0;
	}
	for (unsigned int i = 0; i < M; ++i) {
		nodes[i].weight = 0;
		nodes[i].left = nodes[i].right = nodes[i].parents = M-1;
	}
	ch = fgetc(input);
	while (!feof(input)) {
		leaf[ch]++;
		size++;
		ch = fgetc(input);
	}

	//nodes[N-1].weight置為最大
	nodes[M-1].weight = MAX;

	//篩掉出現頻率為0的字元，並寫入nodes，index陣列，並修改num值
	num = 0;
	for (unsigned int i = 0; i < N; ++i)
		if (leaf[i]) {
			nodes[num].weight = leaf[i];
			leaf[i] = num;
			index[num] = i;
			num++;
		}
	if (!num) {
		cout << "doesn't have a word" << endl;
		system("pause");
		exit(1);
	}
}


// Finds the two lightest nodes without a parent among nodes[0..pos).
// t1 receives the lightest one and t2 the lightest of the rest; on equal
// weights the lower index wins. A result of M-1 means "none found" (slot M-1
// is the sentinel the comparisons start from).
void HuffmanTree::select(unsigned int pos, unsigned int &t1, unsigned int &t2) {
	const unsigned int none = M - 1;
	unsigned int best = none;
	unsigned int second = none;
	for (unsigned int k = 0; k < pos; ++k) {
		if (nodes[k].parents != none)
			continue;  // already merged into a subtree
		if (nodes[k].weight < nodes[best].weight) {
			// New overall minimum; the previous one becomes the runner-up.
			second = best;
			best = k;
		}
		else if (nodes[k].weight < nodes[second].weight) {
			second = k;
		}
	}
	t1 = best;
	t2 = second;
}


void HuffmanTree::enSetTree() {//對需要被壓縮的檔案中的字元建立huffman樹，並且完善每個結點的編碼資訊
	//建立huffman樹
	for (unsigned int i = num; i < num * 2 - 1; ++i) {
		unsigned int t1, t2;
		select(i, t1, t2);
		nodes[i].weight = nodes[t1].weight + nodes[t2].weight;
		nodes[i].left = t1;
		nodes[i].right = t2;
		nodes[t1].parents = nodes[t2].parents = i;
	}

	for (int i = 0; i < N; ++i)
		leafCode[i] = NULL;

	//對每個結點進行編碼
	unsigned int start, c, f, i;
	char *cd = new char[num];                 //編碼臨時變數
	for (i = 0; i < N; i++)
		if (leafCode[i] != NULL) {
			delete[]leafCode[i];  //釋放儲存空間
			leafCode[i] = NULL;
		}
	cd[num - 1] = '\0';         //編碼結束符
	for (i = 0; i < num; i++) {    //逐位求Huffman編碼
		start = num - 1;        //編碼結束符位置
		for (c = i, f = nodes[i].parents; f != M - 1; c = f, f = nodes[c].parents) { //從葉到根求編碼
			if (nodes[f].left == c)cd[--start] = '0';
			else cd[--start] = '1';
		}
		leafCode[i] = new char[num - start];      //為第i個字元編碼分配空間
		strcpy(leafCode[i], &cd[start]);            //從cd複製編碼到HuffmanCode
	}
	delete cd;
}


// Compresses a file (names are asked for by enSieve()).
//
// Layout of the produced file:
//   1. size    - unsigned long long, written raw in the host's byte order
//   2. num     - 8 bits (256 leaves are therefore stored as 0)
//   3. num bytes: the byte value of each leaf, in leaf order
//   4. for every internal node num..2*num-2: left and right child index,
//      maxbit bits each
//   5. the code bits of every input byte, zero-padded to a whole byte
void HuffmanTree::Encode() {
	enSieve();    // opens input/output and counts the bytes of the source
	enSetTree();  // builds the tree and the code of every leaf

	rewind(output);
	rewind(input);
	// Write the tree description at the start of the output.
	buf.bits = 0;
	buf.ch = 0;
	fwrite(&size, sizeof(unsigned long long), 1, output);
	write(num, 8);
	// fputc() stores the low byte of the value on every platform; writing the
	// first byte of the unsigned int's memory is only correct on little-endian
	// hosts (where both produce the same file).
	for (unsigned int i = 0; i < num; ++i)
		fputc(index[i], output);
	// Bits used per node index: one more than the number of significant bits
	// of 2*num-1. deSieve() computes the same value when reading.
	unsigned maxbit = 1;
	unsigned int tmp = num * 2 - 1;
	while (tmp) {
		maxbit++;
		tmp >>= 1;
	}
	for (unsigned int i = num; i < num * 2 - 1; ++i) {
		write(nodes[i].left, maxbit);
		write(nodes[i].right, maxbit);
	}

	// Write the code of every input byte. fgetc()'s int result is compared
	// with EOF directly, so a read error ends the loop as well and EOF is
	// never used to index leaf[].
	int c;
	while ((c = fgetc(input)) != EOF) {
		for (const char* code = leafCode[leaf[c]]; *code != '\0'; ++code)
			write(*code == '1' ? 1 : 0);
	}
	writerest();  // flush the last, partially filled byte

	// The code strings are no longer needed; release them.
	for (unsigned int i = 0; i < num; ++i) {
		delete[] leafCode[i];
		leafCode[i] = NULL;
	}

	cout << "Done!\n\n";
	fclose(input);
	fclose(output);
}

Decode()函式及其輔助函式的實現部分：

//Decode輔助函式塊
void HuffmanTree::deSieve() {//讀取需要被解壓檔案，並建造其哈夫曼樹
	char inName[1000], outName[1000];
	cout << "Input file name that you want to decode:";
	cin >> inName;
	cout << "Input target file name:";
	cin >> outName;
	if ((input = fopen(inName, "rb")) == NULL) {
		cout << "Can not open file." << endl;
		system("pause");
		exit(1);
	}
	if (feof(input)) {
		cout << "Empty source file" << endl;
		system("pause");
		exit(1);
	}
	if ((output = fopen(outName, "wb")) == NULL) {
		cout << "Can not open file." << endl;;
		system("pause");
		exit(1);
	}

	//開始讀取樹結構
	rewind(input);
	for (unsigned int i = 0; i < M; ++i) {
		nodes[i].parents = nodes[i].right = nodes[i].left = N-1;
	}

	buf.bits = 0; //清空緩衝區
	buf.ch = 0;
	fread(&size,sizeof(unsigned long long),1, input);//讀取size
	read(num, 8);//讀取樹結構中的葉結點個數
	if (num == 0)num = 256;
	for (unsigned int i = 0; i < num; ++i)//讀取樹節點中的葉節點
		fread(&index[i], sizeof(char), 1, input);
	//選擇num最大需要多少位來儲存
	unsigned maxbit = 1;
	unsigned int tmp = num * 2 - 1;

	while (tmp) {
		maxbit++;
		tmp >>= 1;
	}

	for (unsigned int i = num; i < num * 2 - 1; ++i) {//讀取左右孩子資訊
		read(nodes[i].left, maxbit);
		read(nodes[i].right, maxbit);
		nodes[nodes[i].left].parents = nodes[nodes[i].right].parents = i;
	}
}


void HuffmanTree::Decode() {
	deSieve();

	//開始譯碼
	rewind(output);
	unsigned int tmp;
	read(tmp);
	for (int i = 0; i < size; ++i) {
		unsigned int loc = 2 * num - 2;
		while ((nodes[loc].left != N-1 || nodes[loc].right != N-1) && !feof(input)) {
			if (tmp == 0)loc = nodes[loc].left;
			else loc = nodes[loc].right;
			read(tmp);
		}
		fputc(index[loc], output);
	}
	cout << "Done!\n\n";
	fclose(input);
	fclose(output);
}

寫下來大概感受就是注意二進位制的長短，以及不要寫的頭暈了。。。長度各種亂。。。奇葩。。

測試部分：

#pragma warning(disable:4996)
#include<iostream>
#include<cstdio>
#include<cmath>
#include<stack>
#include<queue>
#include<cstring>
#include<sstream>
#include<set>
#include<string>
#include<iterator>
#include<vector>
#include<map>
#include<algorithm>
#include"HuffmanTree.h"
using namespace std;

// Interactive test driver: shows a menu until the user picks '3', running one
// compression or decompression per round on a fresh HuffmanTree.
int main(void) {
	cout << sizeof(char) << endl;
	char choose = '1';
	while (choose != '3') {
		HuffmanTree tree;
		cout << "1.Huffman Encode" << endl;
		cout << "2.Huffman Decode" << endl;
		cout << "3.exit" << endl;
		// Stop when standard input is exhausted or broken; otherwise `choose`
		// would keep its previous value and the menu could loop forever.
		if (!(cin >> choose))
			break;
		switch (choose) {
		case'1':tree.Encode(); break;
		case'2':tree.Decode(); break;
		default:break;
		}
	}
//	system("pause");
	return 0;
}

測試效果：

第一波（純文字）：