1. 程式人生 > >Huffman編碼實現壓縮、解壓檔案

Huffman編碼實現壓縮、解壓檔案

Huffman編碼:根據詞頻構建Huffman樹,實現對文字的字首編碼。

1、統計文字中每個字元出現的次數,放入優先佇列中,構建一棵空的二叉樹

2、取出頻率最小的兩個字元a、b,字元a、b的頻率分別作為此二叉樹的左右結點,左結點的編號為0,右結點的編號為1(與下方程式碼一致),其頻率之和(fa + fb)作為該二叉樹的父親節點,放入優先佇列,並將fa、fb從優先佇列中除去;

3、重複第二步操作,直至優先佇列中只剩下一個數,即為此Huffman樹的根節點。

4、從根節點到每個葉節點(文字中出現的字元)的“路徑”,即0、1序列串就是該字元的字首編碼。

注:這種編碼方式保證了,任意一個字元的編碼都不會是其他字元編碼的字首,這樣在解碼過程中就不會混淆。

資料結構:

為方便記錄每個字元的字首編碼,在構建Huffman樹過程中,需要儲存每一個結點的父親節點、左右兒子結點、葉節點對應字元、當前結點頻率。

壓縮過程:

1、首先構建Huffman樹,獲得每個字元對應的字首編碼;

2、將字元及其對應的字首編碼等壓縮資訊寫入壓縮文件中,便於解碼;

3、掃描文字,將文字中的字元轉換成0、1串,每八位,即一個位元組對應的字元儲存到壓縮檔案中。

注:如果最後儲存的0、1串不足八位,則在末尾補0,然後將補的位數資訊寫入壓縮檔案中。

解壓過程:

1、讀取壓縮資訊;

2、掃描壓縮文字,將每個字元轉化成0、1串,匹配字元的字首編碼,轉化成原始檔案。

注:解碼時需刪除之前補充的位數

一點體會:

1、總在迴圈內,動態申請陣列,會導致程式崩潰;

2、千萬不要在迴圈內,每次都呼叫strlen函式,我表示沒能深入瞭解此函式內涵,導致程式慢的要死;

3、原文字越大,壓縮率越高,對於一個2M的檔案,壓縮率大約在45%左右;

4、感謝領導傾情指點,比賽加油!

壓縮過程程式原始碼:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <queue>
#include <vector>
#include <time.h>
using namespace std;

typedef long long LL;
// Size of the char arrays that hold the input/output file paths.
const int FILE_LENGTH = 1000;
// Largest number of bytes pulled from the input file in one read (3 MiB).
const long long MAX_MEMORY = 3LL << 20;
// Number of slots in the per-character tables.
const int KIND_OF_CHARACTER = 260;
// Capacity reserved for the '0'/'1' string of a single Huffman code.
const int HUFFMAN_CODE_LENGTH = 1000;
// Byte position in the compressed file at which the original file size is stored.
const int OFFSET = 20;
// Number of code bits packed into each byte of the compressed file.
const int nBits = 8;

struct Node {
    char c; // character
    int parent, lChild, rChild;//children node
    int iNode; //the serial number of node
    LL number; //number of corresponding character
    friend bool operator < (Node a, Node b) {
        return a.number > b.number;
    }
}node[KIND_OF_CHARACTER];

// HuffmanCode[v] is the code of the character with value v, stored as a
// NUL-terminated string of '0'/'1' characters (filled by BuildHuffmanTree).
char HuffmanCode[KIND_OF_CHARACTER][HUFFMAN_CODE_LENGTH];
//LL characters[KIND_OF_CHARACTER];

// Debug helper: prints every character that has a non-zero count.
void CountKinds(); //for test
// Builds the Huffman tree from the counts in node[] and fills HuffmanCode;
// returns the number of distinct characters (leaf nodes).
int BuildHuffmanTree();
// Writes the size of the input file, the code table and the encoded content
// of filePath into outPutFilePath.
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode);
// Packs the first len '0'/'1' characters of HTstr into bytes and writes them to outPut.
void BitToInt(ofstream &outPut, char *HTstr, LL len);


int main() {

    //scan the file to count frequency of each character.
    char filePath[FILE_LENGTH] = "graph.txt";   //"Aesop_Fables.txt"; "graph.txt";  "1.txt";
    char compressFilePath[FILE_LENGTH] = "result.txt";

    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (readIn.is_open() == 0) {
        cout << "OPEN FAILED!" << endl;
        exit(0);
    }
     //get size of file
    readIn.seekg(0, ios::end);
    LL fileSize = (LL)readIn.tellg();
    readIn.seekg(0, ios::beg);
    cout<<"fileSize" <<fileSize<<endl;
    //read data in batches, each time read MAX_MEMORY characters
    int nTimes = (int)(fileSize / MAX_MEMORY);
    if (fileSize % MAX_MEMORY != 0) nTimes++;
    int kindsOfCharacter = 0;
    cout<<nTimes<<endl;
    for (int i = 1; i <= nTimes; i++) {
        char *str = (char *)calloc(1, (MAX_MEMORY+10)*sizeof(char));

        LL numberOfCharacter = MAX_MEMORY;
        if (i == nTimes) {
            numberOfCharacter = fileSize % MAX_MEMORY;
        }
        readIn.read(str, numberOfCharacter * sizeof(char));
        str[numberOfCharacter] = '\0';
        cout<<strlen(str)<<endl;

        //count the frequency of each character.
        int lenStr = strlen(str);
        for (LL j = 0; j < lenStr; j++) {
            node[str[j]].number++;
            node[str[j]].c = str[j];
        }

        free(str);
    }
   // CountKinds();

    //build Huffman tree
    int numberOfNode = BuildHuffmanTree();

    //compress file using Huffman code
    CompressFile(filePath, compressFilePath, numberOfNode);

    //outPut.close();
   // readIn.close();
}

int BuildHuffmanTree(){
    //apply 2 * KIND_OF_CHARACTER to store nodes of the Huffman tree
    Node* HT = (Node *)malloc((2 * KIND_OF_CHARACTER) * sizeof(Node));
    //put all kinds of character into priority queue
    priority_queue<Node> q;
    int  numberOfNode = 0;
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            node[i].iNode = numberOfNode;
            node[i].c = i;
            q.push(node[i]);
            HT[numberOfNode] = node[i];
            numberOfNode++;
        }
    }
    cout << numberOfNode << endl;
    int jNode = numberOfNode;
    while (q.size() > 1){
        //get two minimal weight nodes and set their parent
        Node leftNode = q.top();
        q.pop();
        Node rightNode = q.top();
        q.pop();
        //cout <<" ##"<< leftNode.number <<endl;
        //cout <<" **"<< rightNode.number <<endl;
        int l = leftNode.iNode;
        int r = rightNode.iNode;
        HT[l].parent = jNode;
        HT[r].parent = jNode;
        //set parent's  information
        HT[jNode].c = ' ';
        HT[jNode].iNode = jNode;
        HT[jNode].lChild = l;
        HT[jNode].rChild = r;
        HT[jNode].number = leftNode.number + rightNode.number;
        q.push(HT[jNode]);
        jNode++;
    }
    HT[jNode-1].parent = -1;
   /* for (int i = 0; i < jNode; i++){
        cout << i << " " << HT[i].c <<  " " << HT[i].number<< endl;
    }*/
    //get each character's Huffman code
    for (int i = 0; i < numberOfNode; i++) {
        int k = 0;
        int l = i;
        char ch = HT[i].c;
        for (int j = HT[i].parent; j != -1; j = HT[j].parent) {
            if (HT[j].lChild == l) {
                HuffmanCode[ch][k] = '0';
            }
            else {
                HuffmanCode[ch][k] = '1';
            }
            l = j;
            k++;
        }
        //reverse the Huffman code
        for (int j = 0; j < k / 2; j++) {
            char temp = HuffmanCode[ch][j];
            HuffmanCode[ch][j] = HuffmanCode[ch][k-1-j];
            HuffmanCode[ch][k-1-j] = temp;
        }
        HuffmanCode[ch][k] = '\0';
        cout << ch << " " <<HuffmanCode[ch] << endl;

    }
    cout<<numberOfNode<<endl;
    free(HT);
    return numberOfNode;
}

void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode){
    //scan characters in input file once more
    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (readIn.is_open() == 0) {
        cout << "OPEN FAILED!" << endl;
        exit(0);
    }

    //write Huffman code file
    //Information: number of bits added, OFFSET, size of original file. the number of kinds of character
    ofstream outPut;
    outPut.open(outPutFilePath, ios::binary);
    if (outPut.is_open() == 0) {
        cout << "OPEN FAILED!" << endl;
        exit(0);
    }

    //get size of file
    readIn.seekg(0, ios::end);
    LL fileSize = (LL)readIn.tellg();
    readIn.seekg(0, ios::beg);
    //write some information in compressed file

    outPut.seekp(OFFSET, ios::beg);
    outPut.write((char *)&fileSize, sizeof(LL));
    outPut.write((char *)&numberOfNode, sizeof(int));
    //record the character and its Huffman code
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            outPut.write((char *)&i, sizeof(char));
            int bits = strlen(HuffmanCode[i]);
            outPut.write((char *)&bits, sizeof(int));
            outPut.write((char *)&HuffmanCode[i], bits*sizeof(char));
        }
    }


    //read data in batches, each time read MAX_MEMORY characters and encode
    int nTimes = (int)(fileSize / MAX_MEMORY);
    if (fileSize % MAX_MEMORY != 0) nTimes++;
    int kindsOfCharacter = 0;
    char *HTstr = (char *)calloc(1, (MAX_MEMORY+HUFFMAN_CODE_LENGTH)*sizeof(char));
    int len = 0;
    LL lenT = 0;
    for (int i = 1; i <= nTimes; i++) {
        char *str = (char *)calloc(1, (MAX_MEMORY+10)*sizeof(char));
        LL numberOfCharacter = MAX_MEMORY;
        if (i == nTimes) {
            numberOfCharacter = fileSize % MAX_MEMORY;
        }
        readIn.read(str, numberOfCharacter * sizeof(char));
        str[numberOfCharacter] = '\0';
        for (LL j = 0; j < numberOfCharacter; j++) {
            char ch = str[j];
            lenT += strlen(HuffmanCode[ch]);
            strcpy(HTstr+len, HuffmanCode[ch]);
            len += strlen(HuffmanCode[ch]);

            //write compressed file in batches
            //when the length of encode string is greater than limited memory
            if (len > MAX_MEMORY) {
               // cout<<"****"<<endl;
                LL leftBits = len % nBits;
                LL changeLength = len - leftBits;
                BitToInt(outPut, HTstr, changeLength);

                //if no left bits, no need to keep it.
                strcpy(HTstr,  HTstr+changeLength);
                len = strlen(HTstr);
            }
        }
        free(str);
    }
    //cout<<strlen(HTstr)<<" "<<HTstr<<endl;
    //if there are left bits, change int integer
    if (len != 0) {
        BitToInt(outPut, HTstr, len);
        //store tail???
    }
    free(HTstr);
    readIn.close();
    outPut.close();
}

// Packs the first len characters of HTstr (a string of '0'/'1') into bytes,
// most significant bit first, and appends the bytes to outPut.
//
// If len is not a multiple of nBits, HTstr is extended with '0' characters up
// to the next multiple (the buffer must have room for nBits extra characters),
// and the number of added bits together with OFFSET is stored at the
// beginning of the file so that a reader can drop the padding again.
//
// outPut - compressed file, positioned where the bytes have to be appended
// HTstr  - NUL-terminated bit string; may be extended by the padding
// len    - number of bits to convert; nothing is written when len <= 0
void BitToInt(ofstream &outPut, char* HTstr, LL len) {
    //no bits, no output: avoids emitting a stray zero byte
    if (len <= 0) {
        return;
    }

    //add 0 to make the length of HTstr divisible by nBits
    int k = 0;
    if (len % nBits != 0) {
        int bitsToAdd = nBits - (int)(len % nBits);
        //record the padding in the file header, then return to the end
        streampos pos = outPut.tellp();
        outPut.seekp(0, ios::beg);
        outPut.write((char *)&bitsToAdd, sizeof(int));
        outPut.write((char *)&OFFSET, sizeof(int));
        outPut.seekp(pos, ios::beg);
        for (; k < bitsToAdd; k++){
            HTstr[len+k] = '0';
        }
        HTstr[len+k] = '\0';
    }

    //convert bit to char
    unsigned char current = 0; // byte under construction
    int filled = 0;            // number of bits already placed in current
    for (LL i = 0; i < len+k && HTstr[i]; i++) {
        current = (unsigned char)((current << 1) | (HTstr[i] == '1' ? 1 : 0));
        filled++;
        if (filled == nBits) {
            outPut.put((char)current);
            current = 0;
            filled = 0;
        }
    }
    //only reachable when the string ended before len+k characters: keep the
    //collected bits in the high positions of a last byte
    if (filled != 0) {
        current = (unsigned char)(current << (nBits - filled));
        outPut.put((char)current);
    }
}


// Debug helper: lists every character whose counter in node[] is non-zero,
// followed by the number of such characters.
void CountKinds(){
    int distinct = 0;
    for (int idx = 0; idx != KIND_OF_CHARACTER; ++idx) {
        if (node[idx].number == 0) {
            continue; // character does not occur
        }
        printf("%c ", node[idx].c);
        cout << node[idx].c << " " << node[idx].number<<endl;
        ++distinct;
    }
    cout << distinct << endl; //76
}
解壓過程程式原始碼:
#include <iostream>
#include <fstream>
#include <algorithm>
#include <cstring>
using namespace std;

typedef long long  LL;
// Size of the char arrays that hold the file paths.
const int FILE_LENGTH = 1000;
// Capacity reserved for the '0'/'1' string of a single Huffman code.
const int HUFFMAN_CODE_LENGTH = 1000;
// Number of entries in the code table.
const int KIND_OF_CHARACTER = 256;
// Largest number of bytes pulled from the compressed file in one read (1 MiB).
const long long MAX_MEMORY = 1LL << 20;


// One entry of the code table read from the compressed file.
struct Node {
	// the decoded character
	char c;
	// its Huffman code as a NUL-terminated string of '0'/'1' characters
	char Huffmancode[HUFFMAN_CODE_LENGTH];
};

// encoding information, filled by GetCompressInformation
Node node[KIND_OF_CHARACTER];

//number of bits taken from each byte of the compressed data
int  nBits = 8;
LL originalFileSize; //the size of original file
int numberOfNode;   //number of kind of character
//number of padding bits appended to the last compressed byte (read from the file header)
int bitsAdded;
//file position of the original file size, read from the file header
int OFFSET;

//Reads the header and the code table into the globals above and node[];
//returns the length of the longest Huffman code.
int GetCompressInformation(ifstream &readIn);
//Decodes the remaining content of readIn and writes the result to writeOut.
void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength);

// Entry point of the decompressor: opens the compressed file and the output
// file, reads the code table and decodes the content.
// Returns 0 on success, 1 if one of the files cannot be opened.
int main() {
	char compressFilePath[FILE_LENGTH] = "result.txt"; //graph.txt  "1.txt";
	char decompressFilePath[FILE_LENGTH] = "decompressResult.txt";
	ifstream readIn;
	readIn.open(compressFilePath, ios::binary);
	if (readIn.is_open() == 0) {
		cout << "OPEN FAILED!" << endl;
		//report the failure with a non-zero status; returning from main also
		//runs the stream destructors
		return 1;
	}
	ofstream writeOut;
	writeOut.open(decompressFilePath, ios::binary);
	if (writeOut.is_open() == 0) {
		cout << "OPEN FAILED!" << endl;
		return 1;
	}
	//get information of compressed file
	int maxEncodingLength = GetCompressInformation(readIn);
	//decompress File
	DecompressFile(readIn, writeOut, maxEncodingLength);
	readIn.close();
	writeOut.close();
	return 0;
}

int GetCompressInformation(ifstream &readIn){
	readIn.read((char *)&bitsAdded, sizeof(int));
	readIn.read((char *)&OFFSET, sizeof(int));
	readIn.seekg(OFFSET, ios::beg);
	readIn.read((char *)&originalFileSize, sizeof(LL));
	readIn.read((char *)&numberOfNode, sizeof(int));
	cout << originalFileSize << " " << numberOfNode << endl;
	//record the character and its Huffman code
	int maxEncodingLength = 0;
	for (int i = 0; i < numberOfNode; i++) {
		readIn.read((char *)&node[i].c, sizeof(char));
		int bits;
		readIn.read((char *)&bits, sizeof(int));
		readIn.read((char *)&node[i].Huffmancode, bits*sizeof(char));
		node[i].Huffmancode[bits] = '\0';
		cout << node[i].c << " " << node[i].Huffmancode << endl;
		if (maxEncodingLength < strlen(node[i].Huffmancode)) {
			maxEncodingLength = strlen(node[i].Huffmancode);
		}
	}
	cout << " maxEncodingLength :" << maxEncodingLength << endl;
	return maxEncodingLength;
}

// Decodes the encoded content of the compressed file.
//
// readIn            - compressed file, positioned at the first content byte
// writeOut          - receives the decoded characters
// maxEncodingLength - length of the longest Huffman code in node[]; if that
//                     many pending bits match no code the data is treated as
//                     corrupt and decoding stops
//
// Uses the globals node[], numberOfNode, bitsAdded (padding bits to drop from
// the very last byte) and originalFileSize (decoding stops once that many
// characters have been produced, when it is positive).
void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength){
	//bits per compressed byte; a local constant, so the global nBits is
	//neither modified nor used as an array size
	const int BITS_PER_BYTE = 8;

	//get size of compressed file
	streampos curPos = readIn.tellg();
	readIn.seekg(0, ios::end);
	LL compressedFileSize = (LL)(readIn.tellg() - curPos);
	readIn.seekg(curPos, ios::beg);
	cout << "size of compressed file : " << compressedFileSize << endl;

	//never look at more table entries than node[] has
	int usableNodes = numberOfNode;
	if (usableNodes < 0) usableNodes = 0;
	if (usableNodes > KIND_OF_CHARACTER) usableNodes = KIND_OF_CHARACTER;

	//code lengths are computed once instead of calling strlen per comparison
	int codeLength[KIND_OF_CHARACTER];
	for (int z = 0; z < usableNodes; z++) {
		codeLength[z] = (int)strlen(node[z].Huffmancode);
	}

	//largest number of pending bits that can still be the start of a code
	int pendingLimit = maxEncodingLength;
	if (pendingLimit < 0) pendingLimit = 0;
	if (pendingLimit > HUFFMAN_CODE_LENGTH) pendingLimit = HUFFMAN_CODE_LENGTH;

	//all buffers are allocated once, outside the loops
	char *buf = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH) * sizeof(char));     //compressed bytes
	char *str = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH) * sizeof(char));     //decoded characters
	char *strTemp = (char *)calloc(1, (2 * HUFFMAN_CODE_LENGTH) * sizeof(char));           //pending bits
	if (!buf || !str || !strTemp) {
		cout << "OUT OF MEMORY!" << endl;
		free(buf);
		free(str);
		free(strTemp);
		return;
	}

	LL lenOfChar = 0;      //characters waiting in str
	LL decoded = 0;        //characters decoded in total
	int lenOfStrTemp = 0;  //pending bits; kept across batches so that a code
	                       //split by a batch boundary is not lost
	LL remaining = compressedFileSize;
	bool finished = false;

	//read data in batches, each time read at most MAX_MEMORY characters
	while (remaining > 0 && !finished) {
		//tracking the remaining byte count keeps the last batch correct even
		//when the size is an exact multiple of MAX_MEMORY
		const LL wanted = remaining < MAX_MEMORY ? remaining : MAX_MEMORY;
		readIn.read(buf, wanted * sizeof(char));
		const LL got = (LL)readIn.gcount();
		if (got <= 0) break; //unexpected end of file / read error
		remaining -= got;

		for (LL k = 0; k < got && !finished; k++) {
			// convert it to binary bits, most significant bit first
			const unsigned char ascII = (unsigned char)buf[k];
			int validBits = BITS_PER_BYTE;
			//if read last character, then minus bits which is added
			if (remaining == 0 && k == got - 1) {
				validBits -= bitsAdded;
				if (validBits < 0) validBits = 0;
				if (validBits > BITS_PER_BYTE) validBits = BITS_PER_BYTE;
			}
			for (int i = 0; i < validBits; i++) {
				strTemp[lenOfStrTemp++] =
					((ascII >> (BITS_PER_BYTE - 1 - i)) & 1) ? '1' : '0';
			}

			//convert bit to char: repeatedly strip the code that prefixes the
			//pending bits
			int consumed = 0;
			while (consumed < lenOfStrTemp) {
				const int available = lenOfStrTemp - consumed;
				int matched = -1;
				for (int z = 0; z < usableNodes; z++) {
					//empty codes would match forever; longer codes than the
					//pending bits cannot match yet
					if (codeLength[z] <= 0 || codeLength[z] > available) continue;
					if (!memcmp(node[z].Huffmancode, strTemp + consumed, codeLength[z])) {
						matched = z;
						break;
					}
				}
				if (matched < 0) break; //need more bits
				str[lenOfChar++] = node[matched].c;
				consumed += codeLength[matched];
				decoded++;
				if (originalFileSize > 0 && decoded >= originalFileSize) {
					finished = true; //everything else can only be padding
					break;
				}
			}
			//move the undecoded bits to the front (regions may overlap)
			if (consumed > 0) {
				lenOfStrTemp -= consumed;
				memmove(strTemp, strTemp + consumed, lenOfStrTemp);
			}
			//as many bits as the longest code and still no match: the data
			//does not belong to this code table; stop before strTemp overflows
			if (!finished && lenOfStrTemp > 0 && lenOfStrTemp >= pendingLimit) {
				cout << "CORRUPTED DATA!" << endl;
				finished = true;
			}

			//if length of str is larger than limited memory, write into decompressed file
			if (lenOfChar > MAX_MEMORY) {
				writeOut.write(str, lenOfChar * sizeof(char));
				lenOfChar = 0;
			}
		}
	}
	if (lenOfChar != 0){
		writeOut.write(str, lenOfChar * sizeof(char));
	}
	//release the buffers on every path
	free(buf);
	free(strTemp);
	free(str);
}