1. 程式人生 > >基於哈夫曼編碼完成的檔案壓縮及解壓

基於哈夫曼編碼完成的檔案壓縮及解壓

這幾天在較為認真的研究基於哈夫曼編碼的檔案壓縮及解壓,費了點時間,在這分享一下:

這裡用鏈式結構,非順序表結構;

檔案壓縮:

1.獲取檔案資訊(這裡採用TXT格式文字);

2.壓縮檔案;

3.寫配置檔案(便於解壓時用,無非就是存放原檔案的索引之類的,比如說,檔案中某個字元出現的個數,記錄下來)

4.解壓縮,使用壓縮後的檔案和配置檔案解壓檔案;

5.用比對軟體,比對解壓後的檔案和原始檔是否相同;

下面慢慢解析:

先看一個檔案資訊類:

// Integer type used for occurrence counts.
using LongType = long long;

// Per-byte record: the byte value, how many times it occurs, and the
// Huffman code assigned to it.
struct FileInfo
{
	unsigned char _ch;       // byte value this record describes
	LongType _count;         // number of occurrences of _ch
	std::string _code;       // Huffman code assigned to _ch

	// Creates a record for `ch` with a zero count and an empty code.
	FileInfo(unsigned char ch = 0)
		:_ch(ch)
		,_count(0)
	{}

	// Returns a record whose count is the sum of both counts. The result's
	// _ch is 0 and its _code is empty; only the count is carried over.
	// Declared const so it can be applied to const operands.
	FileInfo operator+(const FileInfo& x) const
	{
		FileInfo tmp;
		tmp._count = _count + x._count;
		return tmp;
	}

	// Two records differ when their counts differ; _ch and _code are ignored.
	bool operator !=(const FileInfo& x) const
	{
		return _count != x._count;
	}
};

bool operator<(const FileInfo info1,const FileInfo info2)
{
	return info1._count < info2._count;
}
此為一個檔案資訊的結構體,包含字元、字元出現的次數,以及這個字元對應的哈夫曼編碼(能看到這篇部落格的兄弟,對哈夫曼編碼應該不會陌生,這裡不再贅述)

除了統計字元出現的次數及哈夫曼編碼,還完成了幾個運算子的過載

要獲取哈夫曼編碼,就得先建立哈夫曼樹;建立哈夫曼樹時用最小堆來操作,以下是最小堆的實現:

// 小堆
// Comparison functor: returns true when `l` orders before `r` according to
// T's operator<.
template<class T>
struct Less
{
	// const so the functor can be invoked through a const object or reference.
	bool operator() (const T& l, const T& r) const
	{
		return l < r; // uses T's operator<
	}

};

// Comparison functor: returns true when `l` orders after `r` according to
// T's operator>.
template<class T>
struct Greater
{
	// const so the functor can be invoked through a const object or reference.
	bool operator() (const T& l, const T& r) const
	{
		return l > r; // uses T's operator>
	}
};

// Binary heap stored in a vector; the element that compares smallest sits at
// index 0.
//
// NOTE: ordering is decided by the expression `*a < b` on two stored
// elements, so T has to be a pointer-like type whose pointee provides an
// operator< accepting a T on its right-hand side. The Compare template
// parameter is kept for interface compatibility but is not consulted.
template<class T, class Compare = Less<T>>
class Heap
{
public:
	// Creates an empty heap.
	Heap()
	{}

	// Builds a heap from the first `size` elements of `a`.
	Heap(const T* a, size_t size)
	{
		_arrays.reserve(size);
		for (size_t i = 0; i < size; ++i)
		{
			_arrays.push_back(a[i]);
		}

		// Sift down every non-leaf node, last one first. Counting an unsigned
		// index from size/2 down to 1 avoids the wrap-around that
		// `(size - 2) / 2` produces when there are fewer than two elements.
		for (size_t i = _arrays.size() / 2; i > 0; --i)
		{
			AdjustDown(static_cast<int>(i - 1));
		}
	}

	// Inserts `x` and restores the heap order.
	void Push(const T& x)
	{
		_arrays.push_back(x);
		AdjustUp(static_cast<int>(_arrays.size() - 1));
	}

	// Removes the top element. The heap must not be empty.
	void Pop()
	{
		assert(!_arrays.empty());
		// Move the top to the back, drop it, then sift the new top down.
		std::swap(_arrays.front(), _arrays.back());
		_arrays.pop_back();

		AdjustDown(0);
	}

	// Returns the top element. The heap must not be empty.
	T& Top()
	{
		assert(!_arrays.empty());
		return _arrays[0];
	}

	// Read-only access to the top element. The heap must not be empty.
	const T& Top() const
	{
		assert(!_arrays.empty());
		return _arrays[0];
	}

	// Returns true when the heap holds no elements.
	bool Empty() const
	{
		return _arrays.empty();
	}

	// Returns the number of stored elements.
	int Size() const
	{
		return static_cast<int>(_arrays.size());
	}

	// Sifts the element at index `root` down until neither child compares
	// smaller than it.
	void AdjustDown(int root)
	{
		if (root < 0)
		{
			return;
		}

		const size_t count = _arrays.size();
		size_t parent = static_cast<size_t>(root);
		size_t child = parent * 2 + 1;

		while (child < count)
		{
			// Pick the smaller of the two children.
			if (child + 1 < count &&
				*_arrays[child + 1] < _arrays[child])
			{
				++child;
			}

			if (!(*_arrays[child] < _arrays[parent]))
			{
				break;
			}

			std::swap(_arrays[child], _arrays[parent]);
			parent = child;
			child = parent * 2 + 1;
		}
	}

	// Sifts the element at index `child` up while it compares smaller than
	// its parent.
	void AdjustUp(int child)
	{
		while (child > 0)
		{
			const int parent = (child - 1) / 2;
			if (!(*_arrays[child] < _arrays[parent]))
			{
				break;
			}

			std::swap(_arrays[parent], _arrays[child]);
			child = parent;
		}
	}


public:
	std::vector<T> _arrays;   // heap storage, index 0 is the top
};
最小堆裡也完成了很多介面,包括push  pop等

然後就是幾個壓縮和解壓的函式介面

1.根據哈夫曼樹獲取哈夫曼編碼:

	// Visits every node under `root` and, for each leaf, stores that leaf's
	// Huffman code in the matching _infos entry.
	// A code is collected while climbing from the leaf to the root ('1' when
	// the node is a left child, '0' when it is a right child) and is then
	// reversed so that it reads from the root down to the leaf.
	void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
	{
		if (root == nullptr)
		{
			return;
		}

		const bool isLeaf = (root->_left == nullptr) && (root->_right == nullptr);
		if (!isLeaf)
		{
			// Interior node: codes are only produced for leaves below it.
			_GenerateHuffmanCode(root->_left);
			_GenerateHuffmanCode(root->_right);
			return;
		}

		std::string& bits = _infos[root->_weight._ch]._code;
		for (HuffmanTreeNode<FileInfo>* node = root; node->_parent; node = node->_parent)
		{
			HuffmanTreeNode<FileInfo>* up = node->_parent;
			if (up->_left == node)
			{
				bits += '1';
			}
			else if (up->_right == node)
			{
				bits += '0';
			}
		}
		std::reverse(bits.begin(), bits.end());
	}
2.根據最小堆建立哈夫曼樹;
// Builds a Huffman tree from the `size` elements of `a`, skipping every
// element for which `a[i] != invalid` is false, and stores the result in _root.
// When no element is valid, _root is set to nullptr.
void CreateTree(T *a, size_t size, const T& invalid)
	{
		assert(a);
		// The heap stores node pointers, so its element type must be the
		// pointer type, not the node type.
		Heap<HuffmanTreeNode<T>*> s1;

		// Create one leaf per valid element.
		for (size_t i = 0; i < size; ++i)
		{
			if (a[i] != invalid)
			{
				HuffmanTreeNode<T>* node = new HuffmanTreeNode<T>(a[i]);
				s1.Push(node);
			}
		}

		// Nothing to build from (e.g. empty input): calling Top() on an
		// empty heap is invalid, so report "no tree" instead.
		if (s1.Empty())
		{
			_root = nullptr;
			return;
		}

		// Repeatedly merge the two smallest nodes under a new parent.
		while (s1.Size() > 1)
		{
			HuffmanTreeNode<T>* left = s1.Top();
			s1.Pop();
			HuffmanTreeNode<T>* right = s1.Top();
			s1.Pop();

			HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);

			parent->_left = left;
			parent->_right = right;

			left->_parent = parent;
			right->_parent = parent;

			s1.Push(parent);
		}

		// The single remaining node is the root.
		_root = s1.Top();
		s1.Pop();
	}
3.讀取文字檔案中的一行:
	// Appends the next record of `fOutLogFile` to `line`, without the
	// terminating '\n'. Returns false when nothing could be read (end of file
	// or read error), true otherwise.
	//
	// A record may start with '\n' itself (the entry describing the newline
	// byte); in that case the leading '\n' is kept and reading continues up
	// to the next '\n'.
	bool _ReadLine(FILE *fOutLogFile, std::string& line)
	{
		// fgetc returns int so that EOF can be told apart from the byte 0xFF.
		int ch = fgetc(fOutLogFile);
		if (ch == EOF)
		{
			return false;
		}

		if (ch == '\n')
		{
			line += static_cast<char>(ch);
			ch = fgetc(fOutLogFile);
		}

		// Also stop at EOF so that a final record without a trailing newline
		// does not loop forever.
		while (ch != EOF && ch != '\n')
		{
			line += static_cast<char>(ch);
			ch = fgetc(fOutLogFile);
		}
		return true;
	}

4.檔案壓縮
	//檔案壓縮
	// Compresses `filename` into "<filename>.huf".
	//
	// Steps: count how often each byte occurs, build the Huffman tree and the
	// per-byte codes, then re-read the input and write the codes packed eight
	// bits per output byte (most significant bit first).
	//
	// Returns true on success, false when a file cannot be opened, rewound
	// or written.
	bool Compress(const char* filename)
	{
		assert(filename);
		FILE *fOut = fopen(filename, "rb");
		if (fOut == nullptr)
		{
			return false;
		}

		// Pass 1: count the occurrences of every byte. Comparing the int
		// result with EOF ends the loop on a read error as well as at the end
		// of the file.
		for (int c = fgetc(fOut); c != EOF; c = fgetc(fOut))
		{
			_infos[static_cast<unsigned char>(c)]._count++;
		}

		// Build the tree and the codes only when at least one byte has a
		// non-zero count; an empty input has nothing to encode.
		FileInfo invalid;
		bool hasSymbols = false;
		for (size_t i = 0; i < 256 && !hasSymbols; ++i)
		{
			hasSymbols = (_infos[i] != invalid);
		}
		if (hasSymbols)
		{
			HuffmanTree<FileInfo> ht;
			ht.CreateTree(_infos, 256, invalid);
			_GenerateHuffmanCode(ht.GetRoot());
		}

		// Output name is "<input name>.huf".
		std::string compressFile = filename;
		compressFile += ".huf";

		FILE *finCompress = fopen(compressFile.c_str(), "wb");
		if (finCompress == nullptr)
		{
			fclose(fOut);
			return false;
		}

		// Pass 2: rewind the input and emit the code of every byte.
		if (fseek(fOut, 0, SEEK_SET) != 0)
		{
			fclose(fOut);
			fclose(finCompress);
			return false;
		}

		unsigned char inch = 0;  // bits collected for the current output byte
		int index = 0;           // number of bits currently held in inch
		for (int c = fgetc(fOut); c != EOF; c = fgetc(fOut))
		{
			const std::string& code = _infos[static_cast<unsigned char>(c)]._code;

			for (size_t i = 0; i < code.size(); ++i)
			{
				inch <<= 1;     // make room for the next bit
				if (code[i] == '1')
				{
					inch |= 1;
				}

				if (++index == 8)
				{
					fputc(inch, finCompress); // a full byte: write it out
					index = 0;
					inch = 0;
				}
			}
		}

		fclose(fOut);

		// Flush a partially filled last byte: shift the collected bits to the
		// high end and leave the unused low bits as zero padding.
		if (index != 0)
		{
			inch <<= (8 - index);
			fputc(inch, finCompress);
		}

		const bool written = (ferror(finCompress) == 0);
		const bool closed = (fclose(finCompress) == 0);
		return written && closed;
	}

5.寫配置檔案:
string logFile = filename;
		logFile += ".log";
		
		FILE *Log = fopen(logFile.c_str(), "wb");
		assert(Log);

		string chInfo;

		char str[128] = {0}; //沒空間 不可以

		for (size_t i = 1; i < 256; ++i)
		{
			if (_infos[i]._count > 0)
			{
				chInfo += _infos[i]._ch;
				chInfo += ',';
				chInfo += _itoa(_infos[i]._count,str,10);
				chInfo += '\n';
				fputs(chInfo.c_str(), Log);
				chInfo.clear();
			}
		}

		fclose(Log);

6.最後的檔案解壓:
//重構檔案
	// Decodes "<Fileneme>.huf" into "<Fileneme>.over" using the Huffman tree
	// rooted at `root`.
	//
	// root     - root of the Huffman tree; must not be null.
	// Fileneme - base file name; ".huf" and ".over" are appended to it.
	// size     - number of symbols to decode and write.
	//
	// Bits are consumed from the most significant bit of each input byte:
	// a 1 bit follows the left child, a 0 bit follows the right child.
	// Decoding stops early if the input ends or a bit leads to a missing child.
	void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size)
	{
		assert(root);
		if (root == nullptr)
		{
			return;
		}

		// Compressed input.
		std::string name = Fileneme;
		name += ".huf";

		FILE* Out = fopen(name.c_str(),"rb");
		if (Out == nullptr)
		{
			perror(name.c_str());
			return;
		}

		// Restored output.
		std::string restorefilename = Fileneme;
		restorefilename += ".over";
		FILE *over = fopen(restorefilename.c_str(),"wb");
		if (over == nullptr)
		{
			perror(restorefilename.c_str());
			fclose(Out);
			return;
		}

		int current = 0;    // input byte currently being consumed
		int bitsLeft = 0;   // bits of `current` not yet consumed
		bool intact = true; // cleared when the input ends early or is inconsistent

		for (long long remaining = size; intact && remaining > 0; --remaining)
		{
			HuffmanTreeNode<FileInfo>* cur = root;

			// Walk down until a leaf is reached.
			while (cur->_left != nullptr || cur->_right != nullptr)
			{
				// Fetch the next byte only when a bit is actually needed.
				if (bitsLeft == 0)
				{
					current = fgetc(Out);
					if (current == EOF)
					{
						intact = false;
						break;
					}
					bitsLeft = 8;
				}

				--bitsLeft;
				const int bit = (current >> bitsLeft) & 1;
				cur = (bit == 1) ? cur->_left : cur->_right;

				if (cur == nullptr)
				{
					// The bit pattern does not match the tree.
					intact = false;
					break;
				}
			}

			if (!intact)
			{
				break;
			}
			fputc(cur->_weight._ch, over);
		}

		fclose(Out);
		fclose(over);
	}

	
	// Restores the file that was compressed as "<Fileneme>.huf".
	//
	// Steps: read the byte counts from "<Fileneme>.log", rebuild the Huffman
	// tree from them, then decode the compressed data with _RestoreFiles.
	// When the configuration lists no bytes, an empty "<Fileneme>.over" is
	// created instead.
	void UnCompress(const char* Fileneme)
	{
		std::string UnCompressneme = Fileneme;
		UnCompressneme += ".log";
		FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb");
		if (fOutLogFile == nullptr)
		{
			perror(UnCompressneme.c_str());
			return;
		}

		// Each record is "<byte>,<count>"; the count starts at offset 2.
		std::string line;
		while (_ReadLine(fOutLogFile, line))
		{
			// Skip records too short to hold a byte, a comma and a digit.
			if (line.size() >= 3)
			{
				unsigned char ch = line[0];
				// strtoll keeps the full 64-bit count.
				_infos[ch]._count = strtoll(line.substr(2).c_str(), nullptr, 10);
			}
			line.clear();
		}
		fclose(fOutLogFile);

		// With no counted byte there is no tree to build: the original file
		// was empty, so the restored file is empty as well.
		FileInfo invalid;
		bool hasSymbols = false;
		for (size_t i = 0; i < 256 && !hasSymbols; ++i)
		{
			hasSymbols = (_infos[i] != invalid);
		}
		if (!hasSymbols)
		{
			std::string emptyName = Fileneme;
			emptyName += ".over";
			FILE *empty = fopen(emptyName.c_str(), "wb");
			if (empty != nullptr)
			{
				fclose(empty);
			}
			return;
		}

		HuffmanTree<FileInfo> f;
		f.CreateTree(_infos, 256, invalid);

		// The root's count is the sum of all leaf counts, i.e. the number of
		// bytes to restore.
		long long size = f.GetRoot()->_weight._count;
		_RestoreFiles(f.GetRoot(), Fileneme,size);
	}
到此,此專案基本完成;如遇問題,希望留言,隨時解答,如有見解,跪求賜教!