1. 程式人生 > >C語言實現windows1251編碼轉utf-8編碼

C語言實現windows1251編碼轉utf-8編碼

Windows-1251 是用於俄語等西里爾字母語言的單位元組編碼,並不通用。Mini-XML 似乎無法解析這種編碼,但客戶提供的 URL 指向的正是這種編碼的檔案,而我們需要解析其中的資料。
開源的編碼轉換庫又太大,只好用C語言寫一個。有些字元轉換沒什麼規律,只能一一對應,很耿直的轉換方式,哈哈~~,不過以後如果遇到相同的問題,不用再寫一次,節省時間。
這個程式實現將一個windows1251編碼的檔案轉換成utf-8編碼的檔案。

#include <stdio.h>
#include <string.h>

/*
 * Convert a Windows-1251 byte whose code is below 0x80 to UTF-8.
 *
 * For such codes the UTF-8 encoding is the byte itself, so char_in is
 * stored unchanged in out[0]. Exactly one byte is written; no range check
 * is performed on char_in.
 */
void win1251char_utf8_1(unsigned char char_in, unsigned char* out)
{
	*out = char_in;
}

/*
 * Convert a Windows-1251 byte in the range 0x80-0xBF to UTF-8.
 *
 * These 64 codes follow no arithmetic pattern, so each one is looked up in
 * a table. Depending on the code, two or three bytes are stored starting at
 * out[0]; bytes of out beyond the stored sequence are left untouched.
 * Code 0x98 has no entry and stores nothing, and a char_in outside
 * 0x80-0xBF also leaves out unmodified.
 */
void win1251char_utf8_2(unsigned char char_in, unsigned char* out)
{
	/* One row per code 0x80..0xBF: { byte count, UTF-8 bytes... }. */
	static const unsigned char utf8_table[64][4] = {
		/* 0x80 */ {2, 0xd0, 0x82, 0x00},
		/* 0x81 */ {2, 0xd0, 0x83, 0x00},
		/* 0x82 */ {3, 0xe2, 0x80, 0x9a},
		/* 0x83 */ {2, 0xd1, 0x93, 0x00},
		/* 0x84 */ {3, 0xe2, 0x80, 0x9e},
		/* 0x85 */ {3, 0xe2, 0x80, 0xa6},
		/* 0x86 */ {3, 0xe2, 0x80, 0xa0},
		/* 0x87 */ {3, 0xe2, 0x80, 0xa1},
		/* 0x88 */ {3, 0xe2, 0x82, 0xac},
		/* 0x89 */ {3, 0xe2, 0x80, 0xb0},
		/* 0x8a */ {2, 0xd0, 0x89, 0x00},
		/* 0x8b */ {3, 0xe2, 0x80, 0xb9},
		/* 0x8c */ {2, 0xd0, 0x8a, 0x00},
		/* 0x8d */ {2, 0xd0, 0x8c, 0x00},
		/* 0x8e */ {2, 0xd0, 0x8b, 0x00},
		/* 0x8f */ {2, 0xd0, 0x8f, 0x00},
		/* 0x90 */ {2, 0xd1, 0x92, 0x00},
		/* 0x91 */ {3, 0xe2, 0x80, 0x98},
		/* 0x92 */ {3, 0xe2, 0x80, 0x99},
		/* 0x93 */ {3, 0xe2, 0x80, 0x9c},
		/* 0x94 */ {3, 0xe2, 0x80, 0x9d},
		/* 0x95 */ {3, 0xe2, 0x80, 0xa2},
		/* 0x96 */ {3, 0xe2, 0x80, 0x93},
		/* 0x97 */ {3, 0xe2, 0x80, 0x94},
		/* 0x98 */ {0, 0x00, 0x00, 0x00},	/* no mapping: nothing is written */
		/* 0x99 */ {3, 0xe2, 0x84, 0xa2},
		/* 0x9a */ {2, 0xd1, 0x99, 0x00},
		/* 0x9b */ {3, 0xe2, 0x80, 0xba},
		/* 0x9c */ {2, 0xd1, 0x9a, 0x00},
		/* 0x9d */ {2, 0xd1, 0x9c, 0x00},
		/* 0x9e */ {2, 0xd1, 0x9b, 0x00},
		/* 0x9f */ {2, 0xd1, 0x9f, 0x00},
		/* 0xa0 */ {2, 0xc2, 0xa0, 0x00},
		/* 0xa1 */ {2, 0xd0, 0x8e, 0x00},
		/* 0xa2 */ {2, 0xd1, 0x9e, 0x00},
		/* 0xa3 */ {2, 0xd0, 0x88, 0x00},
		/* 0xa4 */ {2, 0xc2, 0xa4, 0x00},
		/* 0xa5 */ {2, 0xd2, 0x90, 0x00},
		/* 0xa6 */ {2, 0xc2, 0xa6, 0x00},
		/* 0xa7 */ {2, 0xc2, 0xa7, 0x00},
		/* 0xa8 */ {2, 0xd0, 0x81, 0x00},
		/* 0xa9 */ {2, 0xc2, 0xa9, 0x00},
		/* 0xaa */ {2, 0xd0, 0x84, 0x00},
		/* 0xab */ {2, 0xc2, 0xab, 0x00},
		/* 0xac */ {2, 0xc2, 0xac, 0x00},
		/* 0xad */ {2, 0xc2, 0xad, 0x00},
		/* 0xae */ {2, 0xc2, 0xae, 0x00},
		/* 0xaf */ {2, 0xd0, 0x87, 0x00},
		/* 0xb0 */ {2, 0xc2, 0xb0, 0x00},
		/* 0xb1 */ {2, 0xc2, 0xb1, 0x00},
		/* 0xb2 */ {2, 0xd0, 0x86, 0x00},
		/* 0xb3 */ {2, 0xd1, 0x96, 0x00},
		/* 0xb4 */ {2, 0xd2, 0x91, 0x00},
		/* 0xb5 */ {2, 0xc2, 0xb5, 0x00},
		/* 0xb6 */ {2, 0xc2, 0xb6, 0x00},
		/* 0xb7 */ {2, 0xc2, 0xb7, 0x00},
		/* 0xb8 */ {2, 0xd1, 0x91, 0x00},
		/* 0xb9 */ {3, 0xe2, 0x84, 0x96},
		/* 0xba */ {2, 0xd1, 0x94, 0x00},
		/* 0xbb */ {2, 0xc2, 0xbb, 0x00},
		/* 0xbc */ {2, 0xd1, 0x98, 0x00},
		/* 0xbd */ {2, 0xd0, 0x85, 0x00},
		/* 0xbe */ {2, 0xd1, 0x95, 0x00},
		/* 0xbf */ {2, 0xd1, 0x97, 0x00},
	};
	const unsigned char *row;
	unsigned char pos;

	/* Codes outside the table's range are ignored. */
	if (char_in < 0x80 || char_in > 0xbf)
	{
		return;
	}

	row = utf8_table[char_in - 0x80];

	/* row[0] is the sequence length; copy only that many bytes. */
	for (pos = 0; pos < row[0]; pos++)
	{
		out[pos] = row[pos + 1];
	}
}

/*
 * Convert a Windows-1251 byte in the range 0xC0-0xFF to UTF-8.
 *
 * This range maps linearly onto two-byte sequences:
 *   0xC0-0xEF -> lead byte 0xD0, trail byte char_in - 0x30
 *   0xF0-0xFF -> lead byte 0xD1, trail byte char_in - 0x70
 * Exactly two bytes are stored (out[0] and out[1]). No range check is done;
 * a char_in below 0xC0 is treated like the first sub-range and the
 * subtraction wraps modulo 256.
 */
void win1251char_utf8_3(unsigned char char_in, unsigned char* out)
{
	const int upper_part = (char_in > (unsigned char)0xef);

	out[0] = upper_part ? 0xd1 : 0xd0;
	out[1] = (unsigned char)(char_in - (upper_part ? 0x70 : 0x30));
}

/*
 * Convert the Windows-1251 file "1251.xml" into the UTF-8 file "utf8.xml".
 *
 * The input is processed in 100-byte chunks. Every input byte expands to at
 * most three UTF-8 bytes, so a 300-byte output buffer is always large enough
 * for one chunk. Bytes below 0x80 are copied as-is; the others are converted
 * by win1251char_utf8_2 (0x80-0xBF) or win1251char_utf8_3 (0xC0-0xFF).
 * A byte for which the converter stores nothing (0x98) is dropped.
 *
 * Returns 0 on success and -1 if a file cannot be opened, read, or written.
 */
int main()
{
	enum { CHUNK_LEN = 100, MAX_SEQ_LEN = 3 };

	unsigned char buf_in[CHUNK_LEN];
	unsigned char buf_out[CHUNK_LEN * MAX_SEQ_LEN];
	unsigned char buf[MAX_SEQ_LEN + 1];
	const char *file_in = "1251.xml";
	const char *file_out = "utf8.xml";
	FILE *fp_in;
	FILE *fp_out;
	size_t real_len, seq_len, i, k;
	int rc = 0;

	/* Open the input first so a missing input does not truncate the output. */
	fp_in = fopen(file_in, "rb");
	if (fp_in == NULL)
	{
		printf("Open file failed!\n");
		return -1;
	}

	/* Binary mode: the converted bytes must reach the file untranslated. */
	fp_out = fopen(file_out, "wb");
	if (fp_out == NULL)
	{
		printf("Open file failed!\n");
		fclose(fp_in);
		return -1;
	}

	while ((real_len = fread(buf_in, 1, sizeof(buf_in), fp_in)) != 0)
	{
		k = 0;
		for (i = 0; i < real_len; i++)
		{
			unsigned char c = buf_in[i];

			if (c < 0x80)
			{
				buf_out[k++] = c;
				continue;
			}

			/*
			 * The converters do not report how many bytes they stored.
			 * No UTF-8 byte they produce is zero, so clearing buf first
			 * lets strlen() recover the sequence length (0 for 0x98).
			 */
			memset(buf, 0, sizeof(buf));
			if (c >= 0xc0)
			{
				win1251char_utf8_3(c, buf);
			}
			else
			{
				win1251char_utf8_2(c, buf);
			}

			seq_len = strlen((const char *)buf);
			memcpy(buf_out + k, buf, seq_len);
			k += seq_len;
		}

		if (fwrite(buf_out, 1, k, fp_out) != k)
		{
			printf("Write file failed!\n");
			rc = -1;
			break;
		}
	}

	/* fread() returns 0 for both EOF and error; tell them apart here. */
	if (ferror(fp_in))
	{
		printf("Read file failed!\n");
		rc = -1;
	}

	fclose(fp_in);

	/* fclose() flushes buffered output, so a failure here means lost data. */
	if (fclose(fp_out) != 0)
	{
		printf("Write file failed!\n");
		rc = -1;
	}

	return rc;
}



程式碼在linux下編譯執行OK,成功將1251.xml檔案轉換成utf8.xml檔案,現在用大部分編輯器都可以打開了。