程式人生 > C++讀寫檔案,處理UTF8檔案,處理GBK中文字元

C++讀寫檔案,處理UTF8檔案,處理GBK中文字元

讀檔案

	//從檔案中提取詞典
	void getLexiconFromTrainData(char* filepath){
		maxLength = 0;
		lexicalItemCount=0;
		allSentenceCount=0;
		wordCount=0;


		ifstream infile;
		char a;
		string line;
		string word;



		infile.open(filepath);//開啟檔案
		if(!infile){
			cerr<<"error:unable to open input file: "<<infile<<endl;
		}

		//每次讀取一個字元進行處理
		do{
		infile.get(a);
		if(infile.eof())
		break;
		cout<<a;
		getchar();
		}while(!infile.eof());
		

		//每次讀取一行進行處理,行與行之間以回車換行分隔
		while(getline(infile,line)){
		allSentence.push_back(line);//每一行都儲存到vector中
		//cout<<line;
		}
		cout<<"檔案讀取完畢"<<endl;
		cout<<"vector長度"<<allSentence.size()<<endl;
		


		//每次讀一個詞進行處理,詞和詞之間用空格分開
		while( infile> >word ){    
			//cout << "Read from file: "<< word<< endl;
			wordCount++;
			lexicalItem.insert(word);
			if(word.size()>maxLength){
				maxLength = word.size();
				cout<<"迄今為止,最長的詞"<<word<<",長度為:"<<word.size()/2<<endl;
			}
			//getchar();
		}
		maxLength = maxLength;
		infile.close();

		cout<<"總詞彙量:"<<wordCount<<endl;
		cout<<"詞典詞數:"<<lexicalItem.size()<<endl;
	}

寫檔案

//把所有的句子儲存迴文件
void FileIOfunc::saveAllSentenceToFile(char* filepath,vector<vector<string> > resultSentence){

		ofstream outfile;
		stringstream ss;

		//outfile.open(filepath,ios::app);//以追加方式寫檔案
		outfile.open(filepath);//以覆蓋方式寫檔案

		//把所有的句子儲存到檔案
		for(vector<vector<string> >::iterator oneSentence = resultSentence.begin();oneSentence!=resultSentence.end();oneSentence++){
			for(vector<string>::iterator oneWord = (*oneSentence).begin();oneWord!=(*oneSentence).end();oneWord++){
				ss<<*oneWord<<" ";
			}
			ss<<endl;
		}
		outfile<<ss.str();
		cout<<"所有的句子儲存迴文件完成"<<endl;

		outfile.clear();
		outfile.close();
	}
處理UTF8檔案
//處理utf8編碼檔案的函式,判斷取字串的偏移量
int utf8_char_len(char firstByte)
{
	const unsigned char kFirstBitMask = 128; // 1000000
	const unsigned char kSecondBitMask = 64; // 0100000
	const unsigned char kThirdBitMask = 32; // 0010000
	const unsigned char kFourthBitMask = 16; // 0001000
	const unsigned char kFifthBitMask = 8; // 0000100
	std::string::difference_type offset = 1;
 
	if(firstByte & kFirstBitMask) // This means the first byte has a value greater than 127, and so is beyond the ASCII range.
		{
			if(firstByte & kThirdBitMask) // This means that the first byte has a value greater than 224, and so it must be at least a three-octet code point.
			{
				if(firstByte & kFourthBitMask) // This means that the first byte has a value greater than 240, and so it must be a four-octet code point.
					{
						offset = 4;
					}else{
						offset = 3;
						}
			}else{ 
				offset = 2;
					} 
			} 
return offset;
}

處理GBK中文字元

//判斷一個字元是否是漢字,是則返回1,否則返回0,處理GBK檔案,GBK中漢字是兩個位元組
int SentenceEncoder::isChineseCharacter(string str){

	char c = str.c_str()[0];
	if(c&0x80){//如果字元高位是1,則是漢字,預設是UTF8編碼
		return 1;
	}else{
		return 0;
	}  

}