基於哈夫曼編碼完成的檔案壓縮及解壓
阿新 • • 發佈:2019-01-10
這幾天在較為認真的研究基於哈夫曼編碼的檔案壓縮及解壓,費了點時間,在這分享一下:
這裡用鏈式結構,非順序表結構;
檔案壓縮:
1.獲取檔案資訊(這裡採用TXT格式文字);
2.壓縮檔案;
3.寫配置檔案(便於解壓時用,無非就是存放原檔案的索引之類的,比如說,檔案中某個字元出現的個數,記錄下來)
4.解壓縮,使用壓縮後的檔案和配置檔案解壓檔案;
5.用比對軟體,比對解壓後的檔案和原始檔是否相同;
下面慢慢解析:
先看一個檔案資訊類:
// File-information record: for one byte value it stores the byte itself, the number of
// times that byte occurs, and the Huffman code assigned to it. (Huffman coding itself is
// assumed to be familiar and is not explained here.)
using LongType = long long;

/// Per-byte statistics; also used as the weight stored in a Huffman tree node.
struct FileInfo
{
    unsigned char _ch;   ///< the byte value
    LongType _count;     ///< number of occurrences of the byte
    std::string _code;   ///< Huffman code of the byte, as '0'/'1' characters

    /// Creates a record for `ch` with a zero count and an empty code.
    FileInfo(unsigned char ch = 0)
        : _ch(ch)
        , _count(0)
    {}

    /// Returns a record whose count is the sum of both counts.
    /// The result keeps the default byte value (0) and an empty code.
    /// Declared const so that it can be applied to const operands as well.
    FileInfo operator+(const FileInfo& x) const
    {
        FileInfo tmp;
        tmp._count = this->_count + x._count;
        return tmp;
    }

    /// Two records differ when their counts differ; `_ch` and `_code` are ignored.
    bool operator!=(const FileInfo& x) const
    {
        return this->_count != x._count;
    }
};

/// Orders records by occurrence count.
/// Arguments are taken by const reference so a comparison does not copy `_code`.
inline bool operator<(const FileInfo& info1, const FileInfo& info2)
{
    return info1._count < info2._count;
}
除了統計字元出現的次數及哈夫曼編碼,還完成了幾個運算子的過載
要獲取哈夫曼編碼,就得先建立哈夫曼樹;建立哈夫曼樹時用最小堆來操作,以下是最小堆的建立過程
// The min-heap also provides the usual interface functions, including Push and Pop.
// Min-heap

/// Comparison functor: true when `l` orders before `r` according to operator<.
template<class T>
struct Less
{
    bool operator()(const T& l, const T& r) const
    {
        return l < r; // operator<
    }
};

/// Comparison functor: true when `l` orders after `r` according to operator>.
template<class T>
struct Greater
{
    bool operator()(const T& l, const T& r) const
    {
        return l > r; // operator>
    }
};

/// Binary min-heap stored in a vector.
///
/// Elements are compared with the expression `*a < b`: the left operand is
/// dereferenced and the right operand is not. T is therefore expected to be a
/// pointer-like type for which that expression is valid.
/// NOTE(review): the operator< that makes this expression valid is defined
/// outside this block - confirm against the element type's definition.
///
/// The `Compare` parameter is kept for interface compatibility but is not used
/// by the sift operations.
template<class T, class Compare = Less<T>>
class Heap
{
public:
    Heap() {}

    /// Builds a heap from the `size` elements starting at `a`.
    Heap(const T* a, size_t size)
    {
        _arrays.assign(a, a + size);
        // Sift down every parent, last parent first. Counting down on size_t
        // avoids the unsigned underflow of `size() - 2` when there are fewer
        // than two elements.
        for (size_t i = _arrays.size() / 2; i-- > 0; )
        {
            AdjustDown(static_cast<int>(i));
        }
    }

    /// Inserts `x` and restores heap order.
    void Push(const T& x)
    {
        _arrays.push_back(x);
        AdjustUp(static_cast<int>(_arrays.size()) - 1);
    }

    /// Removes the top element. The heap must not be empty.
    void Pop()
    {
        assert(_arrays.size() > 0);
        using std::swap;
        swap(_arrays[0], _arrays[_arrays.size() - 1]);
        _arrays.pop_back();
        AdjustDown(0);
    }

    /// Returns the top (smallest) element. The heap must not be empty.
    T& Top()
    {
        assert(_arrays.size() > 0);
        return _arrays[0];
    }

    /// Read-only access to the top element. The heap must not be empty.
    const T& Top() const
    {
        assert(_arrays.size() > 0);
        return _arrays[0];
    }

    bool Empty() const
    {
        return _arrays.empty();
    }

    int Size() const
    {
        return static_cast<int>(_arrays.size());
    }

    /// Sifts the element at index `root` down until neither child orders before it.
    void AdjustDown(int root)
    {
        const int count = static_cast<int>(_arrays.size());
        int child = root * 2 + 1;
        while (child < count)
        {
            // Pick the smaller of the two children.
            if (child + 1 < count && *_arrays[child + 1] < _arrays[child])
            {
                ++child;
            }
            if (*_arrays[child] < _arrays[root])
            {
                using std::swap;
                swap(_arrays[child], _arrays[root]);
                root = child;
                child = 2 * root + 1;
            }
            else
            {
                break;
            }
        }
    }

    /// Sifts the element at index `child` up while it orders before its parent.
    void AdjustUp(int child)
    {
        int parent = (child - 1) / 2;
        while (child > 0)
        {
            if (*_arrays[child] < _arrays[parent])
            {
                using std::swap;
                swap(_arrays[parent], _arrays[child]);
                child = parent;
                parent = (child - 1) / 2;
            }
            else
            {
                break;
            }
        }
    }

public:
    std::vector<T> _arrays; ///< heap storage; index 0 is the top
};
然後就是幾個壓縮和解壓的函式介面
1.根據哈夫曼樹獲取哈夫曼編碼:
/// Assigns a Huffman code to every leaf of the tree rooted at `root`.
///
/// For each leaf the path to the root is followed through the `_parent` links:
/// a step out of a left child contributes '1', a step out of a right child
/// contributes '0'. The collected characters are stored in root-to-leaf order
/// in `_infos[<leaf byte>]._code`.
///
/// The code is built in a local string and then assigned, so running the
/// function again replaces an existing code instead of appending to it.
///
/// @param root  root of the (sub)tree to process; nullptr is accepted and ignored.
void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
{
    if (root == nullptr)
    {
        return;
    }

    _GenerateHuffmanCode(root->_left);
    _GenerateHuffmanCode(root->_right);

    // Only leaves (no children) receive a code.
    if (root->_left != nullptr || root->_right != nullptr)
    {
        return;
    }

    // Collect the branch characters leaf -> root; they come out in reverse order.
    std::string path;
    const HuffmanTreeNode<FileInfo>* cur = root;
    for (const HuffmanTreeNode<FileInfo>* parent = cur->_parent;
         parent != nullptr;
         parent = cur->_parent)
    {
        if (parent->_left == cur)
        {
            path += '1';
        }
        else if (parent->_right == cur)
        {
            path += '0';
        }
        cur = parent;
    }

    // Store the path root -> leaf, replacing whatever code was there before.
    _infos[root->_weight._ch]._code.assign(path.rbegin(), path.rend());
}
// 2. Build the Huffman tree from the min-heap:
/// Builds the Huffman tree from the `size` weights in `a`.
///
/// Every element that differs from `invalid` becomes a leaf. The two smallest
/// nodes are then repeatedly removed from a min-heap and joined under a new
/// parent whose weight is their sum, until one node is left; that node becomes
/// `_root`.
///
/// When no element differs from `invalid` (for example, an empty input) there
/// is nothing to build and `_root` is set to nullptr instead of reading from an
/// empty heap.
///
/// @param a        array of weights; must not be null
/// @param size     number of elements in `a`
/// @param invalid  value marking entries that must be skipped
void CreateTree(T *a, size_t size, const T& invalid)
{
    assert(a);

    // The heap stores node pointers, so its element type must be the pointer type.
    Heap<HuffmanTreeNode<T>*> s1;

    // One leaf per valid entry.
    for (size_t i = 0; i < size; ++i)
    {
        if (a[i] != invalid)
        {
            HuffmanTreeNode<T>* node = new HuffmanTreeNode<T>(a[i]);
            s1.Push(node);
        }
    }

    // No valid entries: Top()/Pop() on an empty heap would be invalid.
    if (s1.Empty())
    {
        _root = nullptr;
        return;
    }

    // Join the two smallest nodes until a single root remains.
    while (s1.Size() > 1)
    {
        HuffmanTreeNode<T>* left = s1.Top();
        s1.Pop();
        HuffmanTreeNode<T>* right = s1.Top();
        s1.Pop();

        HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);
        parent->_left = left;
        parent->_right = right;
        left->_parent = parent;
        right->_parent = parent;
        s1.Push(parent);
    }

    _root = s1.Top();
    s1.Pop();
}
3.讀取文字檔案中的一行:
/// Reads one record from the configuration file and appends it to `line`.
///
/// The first byte of a record is always taken as data, even when it is '\n',
/// because the recorded symbol may itself be a newline. After that, bytes are
/// appended up to (not including) the next '\n'.
///
/// Reading also stops at end of file or on a read error, so a last record
/// without a trailing '\n' is returned instead of looping forever.
///
/// @param fOutLogFile  stream to read from
/// @param line         receives the record; existing content is kept and appended to
/// @return false when no byte could be read (end of file or error), true otherwise
bool _ReadLine(FILE *fOutLogFile, std::string& line)
{
    // fgetc returns int so that EOF can be told apart from every byte value.
    int ch = fgetc(fOutLogFile);
    if (ch == EOF)
    {
        return false;
    }

    // First byte: the symbol itself, possibly '\n'.
    line += static_cast<char>(ch);

    // Remainder of the record, up to the terminating '\n' or the end of input.
    for (ch = fgetc(fOutLogFile); ch != EOF && ch != '\n'; ch = fgetc(fOutLogFile))
    {
        line += static_cast<char>(ch);
    }
    return true;
}
4.檔案壓縮
//檔案壓縮
/// Compresses `filename` into "<filename>.huf".
///
/// Steps performed here:
///   1. read the file once and count how often each byte value occurs (`_infos`);
///   2. build the Huffman tree and generate the code of every byte;
///   3. read the file again and write the codes, packed 8 bits per byte with the
///      first bit in the most significant position; the last byte is padded with
///      zero bits on the right.
/// Writing the configuration file is not done in this block.
///
/// @param filename  path of the file to compress; must not be null
/// @return true when both files could be opened and no read/write error was
///         reported, false otherwise
bool Compress(const char* filename)
{
    assert(filename);

    FILE *fOut = fopen(filename, "rb");
    if (fOut == nullptr)
    {
        return false;
    }

    // Pass 1: count byte occurrences. fgetc's int result is compared with EOF,
    // so a read error ends the loop as well.
    bool anyByte = false;
    for (int c = fgetc(fOut); c != EOF; c = fgetc(fOut))
    {
        _infos[static_cast<unsigned char>(c)]._count++;
        anyByte = true;
    }

    // Codes are only needed when the file has content; an empty file yields an
    // empty .huf file without building a tree.
    if (anyByte)
    {
        HuffmanTree<FileInfo> ht;
        FileInfo invalid;
        ht.CreateTree(_infos, 256, invalid);
        _GenerateHuffmanCode(ht.GetRoot());
    }

    // Output name is "<input name>.huf".
    std::string compressFile = filename;
    compressFile += ".huf";
    FILE *finCompress = fopen(compressFile.c_str(), "wb");
    if (finCompress == nullptr)
    {
        fclose(fOut);
        return false;
    }

    // Pass 2: rewind and emit the code of every byte.
    fseek(fOut, 0, SEEK_SET);
    unsigned char inch = 0; // bits collected for the current output byte
    int index = 0;          // number of bits currently held in `inch`
    for (int c = fgetc(fOut); c != EOF; c = fgetc(fOut))
    {
        const std::string& code = _infos[static_cast<unsigned char>(c)]._code;
        for (size_t i = 0; i < code.size(); ++i)
        {
            inch <<= 1; // make room in the lowest bit
            if (code[i] == '1')
            {
                inch |= 1;
            }
            if (++index == 8)
            {
                fputc(inch, finCompress); // a full byte: write it out
                index = 0;
                inch = 0;
            }
        }
    }
    const bool readOk = (ferror(fOut) == 0);
    fclose(fOut);

    // A partially filled last byte: shift the bits to the top, the rest is padding.
    if (index != 0)
    {
        inch <<= (8 - index);
        fputc(inch, finCompress);
    }

    const bool writeOk = (ferror(finCompress) == 0);
    const bool closeOk = (fclose(finCompress) == 0);
    return readOk && writeOk && closeOk;
}
5.寫配置檔案:
string logFile = filename;
logFile += ".log";
FILE *Log = fopen(logFile.c_str(), "wb");
assert(Log);
string chInfo;
char str[128] = {0}; //沒空間 不可以
for (size_t i = 1; i < 256; ++i)
{
if (_infos[i]._count > 0)
{
chInfo += _infos[i]._ch;
chInfo += ',';
chInfo += _itoa(_infos[i]._count,str,10);
chInfo += '\n';
fputs(chInfo.c_str(), Log);
chInfo.clear();
}
}
fclose(Log);
6.最後的檔案解壓:
//重構檔案
/// Rebuilds the original data from "<Fileneme>.huf" into "<Fileneme>.over".
///
/// The compressed stream is consumed bit by bit, most significant bit first.
/// Starting at `root`, a 1 bit moves to the left child and a 0 bit to the right
/// child; on reaching a node without children its byte is written and the walk
/// restarts at `root`. Exactly `size` bytes are written.
///
/// @param root      root of the Huffman tree; must not be null
/// @param Fileneme  base file name, without the ".huf" / ".over" suffix
/// @param size      number of bytes to restore
void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size)
{
    assert(root);

    // Compressed input.
    std::string packedPath(Fileneme);
    packedPath.append(".huf");
    FILE* Out = fopen(packedPath.c_str(), "rb");
    assert(Out);

    // Restored output.
    std::string restoredPath(Fileneme);
    restoredPath.append(".over");
    FILE *over = fopen(restoredPath.c_str(), "wb");
    assert(over);

    int bitPos = 8;                         // index just above the next bit to read
    unsigned char current = fgetc(Out);     // byte currently being decoded

    for (long long remaining = size; remaining > 0; --remaining)
    {
        HuffmanTreeNode<FileInfo>* node = root;

        // Descend until a node without children is reached.
        while (!(node->_left == nullptr && node->_right == nullptr))
        {
            --bitPos;
            const int bit = (current >> bitPos) & 1;
            node = (bit == 1) ? node->_left : node->_right;

            // Current byte used up: fetch the next one.
            if (bitPos == 0)
            {
                current = fgetc(Out);
                bitPos = 8;
            }
        }

        fputc(node->_weight._ch, over);
    }

    fclose(Out);
    fclose(over);
}
/// Decompresses "<Fileneme>.huf" using the counts stored in "<Fileneme>.log".
///
/// Steps:
///   1. read the configuration file and load the per-byte counts into `_infos`;
///   2. rebuild the Huffman tree from those counts;
///   3. decode the compressed data with `_RestoreFiles`; the number of bytes to
///      restore is the count stored in the tree root.
///
/// @param Fileneme  base file name, without the ".log" / ".huf" suffix
void UnCompress(const char* Fileneme)
{
    std::string UnCompressneme = Fileneme;
    UnCompressneme += ".log";
    FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb");
    assert(fOutLogFile);

    // Record layout: <byte> ',' <decimal count>.
    std::string line;
    while (_ReadLine(fOutLogFile, line))
    {
        // Records too short to hold a count are skipped rather than indexed
        // past their end.
        if (line.size() > 2)
        {
            const unsigned char ch = line[0];
            // atoll keeps the full long long range of the count.
            _infos[ch]._count = atoll(line.c_str() + 2);
        }
        line.clear();
    }
    fclose(fOutLogFile);

    // With no counts at all there is no tree to build; produce an empty output
    // file instead.
    bool anySymbol = false;
    for (size_t i = 0; i < 256 && !anySymbol; ++i)
    {
        anySymbol = (_infos[i]._count != 0);
    }
    if (!anySymbol)
    {
        std::string restored = Fileneme;
        restored += ".over";
        FILE *over = fopen(restored.c_str(), "wb");
        if (over != nullptr)
        {
            fclose(over);
        }
        return;
    }

    HuffmanTree<FileInfo> f;
    FileInfo invalid;
    f.CreateTree(_infos, 256, invalid);

    // Restore the data from the rebuilt tree; the root count is the total size.
    long long size = f.GetRoot()->_weight._count;
    _RestoreFiles(f.GetRoot(), Fileneme, size);
}
到此,此專案基本完成;如遇問題,希望留言,隨時解答,如有見解,跪求賜教!