1. 程式人生 > >基於哈夫曼樹的檔案壓縮

基於哈夫曼樹的檔案壓縮

基本思想:
壓縮:
1、統計出檔案中相同字元出現的次數
2、獲取哈夫曼編碼
次數作為權值構建哈夫曼樹
3、重新編碼,寫回壓縮檔案
儲存標頭檔案:
原始檔字尾
編碼資訊的行數
每個字元的權
儲存編碼

解壓縮:
1、獲取原檔案字尾
2、獲取每個字元出現的次數,即權值
3、利用之前獲取的權值,還原哈夫曼樹
4、找到對應的葉子節點,將資訊儲存到解壓檔案中

在寫壓縮檔案之前,首先需要實現堆和哈夫曼樹

1,建堆

#include<cassert>
#include<iostream>
#include<utility>
#include<vector>
using namespace std;

// Comparison functors: the same Heap implementation behaves as a min-heap or
// a max-heap depending on which functor is plugged in.

// Returns true when l is smaller than r. Heap<T, Small<T>> keeps the smallest
// element on top.
template<class T>
struct Small
{
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};

// Returns true when l is greater than r. Heap<T, Large<T>> keeps the largest
// element on top.
template<class T>
struct Large
{
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};

// Binary heap stored in a vector.
//
// Compare(child, parent) returning true means the child has to be moved above
// the parent. The default comparator, Large<T>, therefore produces a max-heap;
// pass Small<T> (or any "less"-style functor) to get a min-heap.
template<class T, class Compare = Large<T>>
class Heap
{
public:
    Heap() {}

    // Builds a heap from the first `size` elements of the array `a`.
    Heap(const T *a, int size)
    {
        assert(a);
        assert(size >= 0);
        _a.reserve(static_cast<size_t>(size));
        for (int i = 0; i < size; ++i)
        {
            _a.push_back(a[i]);
        }
        // Sift down every internal node, starting from the last one
        // (the parent of the last element) and ending at the root.
        for (int j = (size - 2) / 2; j >= 0; --j)
        {
            adjust_down(j);
        }
    }

    // Appends x and restores the heap property by sifting it up.
    void Push(const T& x)
    {
        _a.push_back(x);
        adjust_up(static_cast<int>(_a.size()) - 1);
    }

    // Removes the top element. The heap must not be empty.
    void Pop()
    {
        assert(!_a.empty());
        using std::swap;
        // Move the top to the back, drop it, then repair from the root.
        swap(_a[0], _a[_a.size() - 1]);
        _a.pop_back();
        adjust_down(0);
    }

    size_t Size() const
    {
        return _a.size();
    }

    bool Empty() const
    {
        return _a.empty();
    }

    // Returns the top element. The heap must not be empty.
    const T& Top() const
    {
        assert(!_a.empty());
        return _a[0];
    }

    // Prints the underlying array (level order) on one line of stdout.
    void Display()
    {
        for (size_t i = 0; i < _a.size(); ++i)
        {
            cout << _a[i] << " ";
        }
        cout << endl;
    }

    // Sifts the element at index `root` down until neither child has to be
    // above it.
    void adjust_down(int root)
    {
        if (root < 0)
        {
            return;
        }
        using std::swap;
        Compare com;
        size_t parent = static_cast<size_t>(root);
        size_t child = parent * 2 + 1;          // left child of parent
        while (child < _a.size())
        {
            // The loop condition only guarantees that the left child exists,
            // so the right child's index is checked before it is read.
            if (child + 1 < _a.size() && com(_a[child + 1], _a[child]))
            {
                ++child;
            }
            if (!com(_a[child], _a[parent]))
            {
                break;
            }
            swap(_a[child], _a[parent]);
            parent = child;                     // continue from the child's slot
            child = parent * 2 + 1;
        }
    }

    // Sifts the element at index `child` up until its parent no longer has to
    // be below it. The loop keeps climbing after every swap; stopping after a
    // single level would leave the heap property violated.
    void adjust_up(int child)
    {
        using std::swap;
        Compare com;
        while (child > 0)
        {
            const int parent = (child - 1) / 2;
            if (!com(_a[child], _a[parent]))
            {
                break;
            }
            swap(_a[child], _a[parent]);
            child = parent;
        }
    }

protected:
    vector<T> _a;
};

2,構建哈夫曼樹

#include "heap.h"

// Tree node carrying a weight of type T together with links to its two
// children and to its parent. All three links start out null.
template<class T>
struct HuffmanTreeNode
{
    T _weight;                                  // weight stored in this node
    HuffmanTreeNode<T> *_left = nullptr;
    HuffmanTreeNode<T> *_right = nullptr;
    HuffmanTreeNode<T> *_parent = nullptr;

    // Creates an unlinked node holding a copy of w (a value-initialized T
    // when no argument is given).
    HuffmanTreeNode(const T& w = T())
        : _weight(w)
    {
    }
};

template<class T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;
public:
    HuffmanTree()
        :_root(NULL)
    {}
    HuffmanTree(const T* a, size_t size)
        :_root(NULL)
    {
        //定義一個內部類
        struct NodeLess
        {
            bool operator()(Node *l, Node *r)const
            {
                return l->_weight < r->_weight;
            }
        };
        Heap<Node *, NodeLess> minHeap;
        //建立結點並放入vector中
        for (size_t i = 0; i<size; ++i)
        {
            Node *tmp = new Node(a[i]);
            minHeap.Push(tmp);
        }
        //取出較小的兩個結點作為左右孩子並構建父結點
        while (minHeap.Size() > 1)
        {
            Node *left = minHeap.Top();
            minHeap.Pop();
            Node *right = minHeap.Top();
            minHeap.Pop();
            Node *parent = new Node(left->_weight + right->_weight);
            parent->_left = left;
            parent->_right = right;
            left-> = p_parentarent;
            right->_parent = parent;
            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }

    HuffmanTree(const T* a, size_t size, const T& invalid)
    {

        struct NodeLess
        {
            bool operator()(Node *l, Node *r)const
            {
                return l->_weight < r->_weight;
            }
        };
        Heap<Node *, NodeLess> minHeap;
        //建立結點並放入vector中
        for (size_t i = 0; i<size; ++i)
        {
            if (a[i] != invalid)
            {
                Node *tmp = new Node(a[i]);
                minHeap.Push(tmp);
            }
        }
        //取出較小的兩個結點作為左右孩子並構建父結點
        while (minHeap.Size() > 1)
        {
            Node *left = minHeap.Top();
            minHeap.Pop();
            Node *right = minHeap.Top();
            minHeap.Pop();
            Node *parent = new Node(left->_weight + right->_weight);
            parent->_left = left;
            parent->_right = right;
            left->_parent = parent;
            right->_parent = parent;
            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }
    Node* GetRoot()
    {
        return _root;
    }

    void Destroy(Node* &root)
    {
        if (root == NULL)
            return;
        Destroy(root->_left);
        Destroy(root->_rihgt);
        delete root;
        root = NULL:
        return;
    }
protected:
    Node *_root;
};

3,生成哈夫曼編碼,並進行壓縮和解壓縮

#include<string>
#include<Windows.h>
#include<assert.h>

#include "huffman_tree.h"
using namespace std;


// Counter type for the number of occurrences of a byte.
typedef long long Type;

// Per-byte record: the byte value, how often it occurs and its Huffman code.
// Arithmetic and comparisons look at the count only, which is what lets a
// CharInfo be used directly as a tree weight.
struct CharInfo
{
    unsigned char _ch;     // the byte value
    Type _count;           // number of occurrences
    std::string _code;     // Huffman code as a string of '0'/'1' characters

    // Creates a record for byte 0 with the given count and an empty code.
    CharInfo(Type count = 0)
        : _ch(0)
        , _count(count)
    {}

    // Sum of the two counts; the result has _ch == 0 and an empty code.
    CharInfo operator + (const CharInfo& fc) const
    {
        return CharInfo(_count + fc._count);
    }

    // Taken by const reference so a comparison does not copy the code string.
    bool operator != (const CharInfo& fc) const
    {
        return _count != fc._count;
    }

    bool operator < (const CharInfo& fc) const
    {
        return _count < fc._count;
    }
};

// Huffman-based file compressor.
//
// Compress(name) writes two files next to the input:
//   name.compress - the Huffman-coded bit stream, most significant bit first,
//                   with the last byte padded with zero bits;
//   name.config   - one "<byte>,<count>\n" record per byte value that occurs
//                   in the input, from which UnCompress rebuilds the tree.
// UnCompress(archive) strips the last extension of `archive` to locate the
// .config file and writes the restored data to "<stem>.uncompress".
//
// Both functions return an empty string when one of the files cannot be
// opened.
class FileCompress
{
protected:
    CharInfo _infos[256];   // per-byte statistics and codes, indexed by byte value
public:
    // Labels every table entry with its own byte value.
    FileCompress()
    {
        for (size_t i = 0; i < 256; ++i)
        {
            _infos[i]._ch = static_cast<unsigned char>(i);
        }
    }

    // Walks the tree and stores, for every leaf, the path from the root
    // ('0' = left, '1' = right) as that byte's code. `code` is the path
    // accumulated so far.
    void GenerateHufffmanCode(HuffmanTreeNode<CharInfo> * root, string code)
    {
        if (root == nullptr)
            return;
        if (root->_left == nullptr && root->_right == nullptr)   // leaf
        {
            _infos[root->_weight._ch]._code = code;
            return;
        }
        GenerateHufffmanCode(root->_left, code + '0');
        GenerateHufffmanCode(root->_right, code + '1');
    }

    // Compresses `filename` into "<filename>.compress" plus
    // "<filename>.config". Returns the name of the compressed file, or an
    // empty string if a file could not be opened. An empty input produces
    // empty output files.
    string Compress(const char *filename)
    {
        assert(filename);
        ResetInfos();   // do not mix in statistics from an earlier call

        FILE *src = fopen(filename, "rb");
        if (src == nullptr)
            return string();

        string compressfile = filename;
        compressfile += ".compress";
        FILE *dst = fopen(compressfile.c_str(), "wb");
        if (dst == nullptr)
        {
            fclose(src);
            return string();
        }

        string configfilename = filename;
        configfilename += ".config";
        FILE *config = fopen(configfilename.c_str(), "wb");
        if (config == nullptr)
        {
            fclose(src);
            fclose(dst);
            return string();
        }

        // Pass 1: count how often each byte value occurs. fgetc's result is
        // kept in an int so that EOF (end of file or read error) can be told
        // apart from the byte 0xFF.
        Type total = 0;
        int ch;
        while ((ch = fgetc(src)) != EOF)
        {
            ++_infos[ch]._count;
            ++total;
        }

        if (total > 0)
        {
            // Entries whose count equals the default (0) are left out of the tree.
            CharInfo invalid;
            HuffmanTree<CharInfo> ht(_infos, 256, invalid);
            GenerateHufffmanCode(ht.GetRoot(), string());

            // Pass 2: re-read the input and pack the code bits into bytes.
            fseek(src, 0, SEEK_SET);
            unsigned char value = 0;
            int pos = 0;                    // number of bits collected in value
            while ((ch = fgetc(src)) != EOF)
            {
                const string &code = _infos[ch]._code;
                for (size_t i = 0; i < code.size(); ++i)
                {
                    value = static_cast<unsigned char>(value << 1);
                    if (code[i] == '1')
                        value |= 1;
                    if (++pos == 8)         // a full byte has been collected
                    {
                        fputc(value, dst);
                        value = 0;
                        pos = 0;
                    }
                }
            }
            if (pos)                        // flush the last, partially filled byte
            {
                value = static_cast<unsigned char>(value << (8 - pos));
                fputc(value, dst);
            }
        }

        // Config file: the counts needed to rebuild the same tree.
        // std::to_string formats the full long long count.
        string line;
        for (size_t i = 0; i < 256; ++i)
        {
            if (_infos[i]._count)
            {
                line.clear();
                line += static_cast<char>(_infos[i]._ch);
                line += ",";
                line += std::to_string(_infos[i]._count);
                line += "\n";
                fwrite(line.c_str(), 1, line.size(), config);
            }
        }

        fclose(src);
        fclose(dst);
        fclose(config);
        return compressfile;
    }

    // Restores the data of the compressed file `filename`. The matching
    // config file is looked up as "<stem>.config", where <stem> is `filename`
    // without its last extension; the result is written to
    // "<stem>.uncompress". Returns that name, or an empty string if a file
    // could not be opened.
    string UnCompress(const char *filename)
    {
        assert(filename);
        ResetInfos();   // the table must reflect the config file only

        const string compressfilename = filename;
        const string stem = compressfilename.substr(0, compressfilename.rfind('.'));

        const string configfilename = stem + ".config";
        FILE *config = fopen(configfilename.c_str(), "rb");
        if (config == nullptr)
            return string();

        // Read the "<byte>,<count>" records.
        string line;
        while (ReadLine(config, line))
        {
            if (line.empty())
            {
                // The record's byte is '\n' itself, so ReadLine stopped right
                // away. Put the byte back and let the next call append the
                // ",<count>" part.
                line += '\n';
                continue;
            }
            if (line.size() >= 3 && line[1] == ',')
            {
                const unsigned char byte = static_cast<unsigned char>(line[0]);
                const Type count = strtoll(line.c_str() + 2, nullptr, 10);
                if (count > 0)
                    _infos[byte]._count = count;
            }
            // Malformed records are skipped.
            line.clear();
        }
        fclose(config);

        // Number of bytes to restore = sum of all counts.
        Type charcount = 0;
        for (size_t i = 0; i < 256; ++i)
        {
            charcount += _infos[i]._count;
        }

        FILE *in = fopen(compressfilename.c_str(), "rb");
        if (in == nullptr)
            return string();

        const string uncompressfilename = stem + ".uncompress";
        FILE *out = fopen(uncompressfilename.c_str(), "wb");
        if (out == nullptr)
        {
            fclose(in);
            return string();
        }

        if (charcount > 0)
        {
            CharInfo invalid;
            HuffmanTree<CharInfo> hft(_infos, 256, invalid);
            HuffmanTreeNode<CharInfo> *root = hft.GetRoot();

            if (root->_left == nullptr && root->_right == nullptr)
            {
                // Only one distinct byte: its code is empty, so the
                // compressed stream carries no bits and the count alone
                // determines the output.
                for (Type i = 0; i < charcount; ++i)
                {
                    fputc(root->_weight._ch, out);
                }
            }
            else
            {
                HuffmanTreeNode<CharInfo> *cur = root;
                int byte;
                // Stop at the expected byte count (the padding bits of the
                // last byte are not data) or when the input ends early.
                while (charcount > 0 && (byte = fgetc(in)) != EOF)
                {
                    for (int pos = 7; pos >= 0 && charcount > 0; --pos)
                    {
                        // 1 -> right child, 0 -> left child
                        cur = (byte & (1 << pos)) ? cur->_right : cur->_left;
                        if (cur->_left == nullptr && cur->_right == nullptr)
                        {
                            fputc(cur->_weight._ch, out);
                            cur = root;
                            --charcount;
                        }
                    }
                }
            }
        }

        fclose(in);
        fclose(out);
        return uncompressfilename;
    }

    // Appends the characters up to (not including) the next '\n' to `line`.
    // Returns false only when the stream is already at its end.
    bool ReadLine(FILE *fout, string& line)
    {
        int ch = fgetc(fout);
        if (ch == EOF)
            return false;
        while (ch != EOF && ch != '\n')
        {
            line += static_cast<char>(ch);
            ch = fgetc(fout);
        }
        return true;
    }

protected:
    // Clears the counts and codes left over from a previous operation.
    void ResetInfos()
    {
        for (size_t i = 0; i < 256; ++i)
        {
            _infos[i]._count = 0;
            _infos[i]._code.clear();
        }
    }
};

4,測試

#include"huffman_code.h"



void testFileCompress()
{
    FileCompress fc;
    fc.Compress("1.png");
    fc.UnCompress("1.png.compress");
}

// Entry point of the demo: runs the compression round trip, then waits for a
// key press before exiting.
int main()
{
    testFileCompress();

    // Runs the shell's "pause" command so the console window stays open.
    system("pause");
    return 0;
}