C語言實現windows1251編碼轉utf-8編碼
阿新 • • 發佈:2019-01-23
Windows-1251(CP1251)是俄語等西里爾字母語言常用的單位元組編碼,並不通用。mini xml好像無法解析這種編碼,但客戶提供的就是一個這種編碼檔案的url,需要解析裡面的資料。
開源的編碼轉換庫又太大,只好用C語言寫一個。有些字元轉換沒什麼規律,只能一一對應,很耿直的轉換方式,哈哈~~,不過以後如果遇到相同的問題,不用再寫一次,節省時間。
這個程式實現將一個windows1251編碼的檔案轉換成utf-8編碼的檔案。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Windows-1251 (CP1251) to UTF-8 file converter.
 *
 * Usage: prog [input-file [output-file]]
 * Without arguments it converts "1251.xml" into "utf8.xml".
 *
 * Define WIN1251_NO_MAIN when compiling to leave out main() and reuse the
 * conversion routines inside another program.
 */

enum {
    WIN1251_MAX_UTF8_LEN = 3,   /* longest UTF-8 sequence one input byte can produce */
    WIN1251_CHUNK_SIZE = 4096   /* number of input bytes converted per read */
};

/*
 * Unicode code points for the Windows-1251 bytes 0x80-0xBF.  These 64
 * characters follow no arithmetic pattern, so they are listed one by one.
 * 0 marks the single unassigned byte, 0x98.
 */
static const unsigned short win1251_high_table[64] = {
    /* 0x80 */ 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
    /* 0x90 */ 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x0000, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
    /* 0xA0 */ 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
    /* 0xA8 */ 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
    /* 0xB0 */ 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
    /* 0xB8 */ 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
};

/*
 * Convert one Windows-1251 byte to UTF-8.
 *
 * Writes at most WIN1251_MAX_UTF8_LEN bytes to out (no NUL terminator is
 * added) and returns how many bytes were written.  Returns 0 and leaves out
 * untouched for the unassigned byte 0x98.
 */
static size_t win1251char_to_utf8(unsigned char char_in, unsigned char *out)
{
    unsigned int cp;

    if (char_in < 0x80) {
        /* ASCII is encoded identically in UTF-8. */
        out[0] = char_in;
        return 1;
    }

    if (char_in >= 0xC0) {
        /* 0xC0-0xFF are the contiguous letters U+0410..U+044F. */
        cp = 0x0410u + (char_in - 0xC0u);
    } else {
        cp = win1251_high_table[char_in - 0x80];
        if (cp == 0) {
            return 0;
        }
    }

    if (cp < 0x800) {
        out[0] = (unsigned char)(0xC0 | (cp >> 6));
        out[1] = (unsigned char)(0x80 | (cp & 0x3F));
        return 2;
    }

    out[0] = (unsigned char)(0xE0 | (cp >> 12));
    out[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
    out[2] = (unsigned char)(0x80 | (cp & 0x3F));
    return 3;
}

// Convert a character whose code is below 0x80: it is copied to out[0] as is.
void win1251char_utf8_1(unsigned char char_in, unsigned char *out)
{
    out[0] = char_in;
}

// Convert a character in the range 0x80-0xBF to UTF-8 (2 or 3 bytes written
// to out).  Nothing is written for the unassigned byte 0x98 or for a byte
// outside this range.
void win1251char_utf8_2(unsigned char char_in, unsigned char *out)
{
    if (char_in < 0x80 || char_in > 0xBF) {
        return;
    }
    (void)win1251char_to_utf8(char_in, out);
}

// Convert a character in the range 0xC0-0xFF to UTF-8 (always 2 bytes written
// to out).  A byte below 0xC0 is outside this function's range and leaves out
// untouched.
void win1251char_utf8_3(unsigned char char_in, unsigned char *out)
{
    if (char_in < 0xC0) {
        return;
    }
    (void)win1251char_to_utf8(char_in, out);
}

#ifndef WIN1251_NO_MAIN
/*
 * Convert the input file (argv[1], default "1251.xml") from Windows-1251 to
 * UTF-8 and write the result to the output file (argv[2], default
 * "utf8.xml").  Returns EXIT_SUCCESS only if every read and write succeeded.
 */
int main(int argc, char **argv)
{
    const char *file_in = (argc > 1) ? argv[1] : "1251.xml";
    const char *file_out = (argc > 2) ? argv[2] : "utf8.xml";
    unsigned char buf_in[WIN1251_CHUNK_SIZE];
    /* Worst case: every input byte expands to WIN1251_MAX_UTF8_LEN bytes. */
    unsigned char buf_out[WIN1251_CHUNK_SIZE * WIN1251_MAX_UTF8_LEN];
    size_t real_len;
    int rc = EXIT_FAILURE;

    /* Open the input first so a missing input does not truncate the output. */
    FILE *fp_in = fopen(file_in, "rb");
    if (fp_in == NULL) {
        fprintf(stderr, "Open file failed!\n");
        return EXIT_FAILURE;
    }

    /* Binary mode: the converted bytes must reach the file unmodified. */
    FILE *fp_out = fopen(file_out, "wb");
    if (fp_out == NULL) {
        fprintf(stderr, "Open file failed!\n");
        fclose(fp_in);
        return EXIT_FAILURE;
    }

    while ((real_len = fread(buf_in, 1, sizeof buf_in, fp_in)) > 0) {
        size_t k = 0;

        for (size_t i = 0; i < real_len; i++) {
            k += win1251char_to_utf8(buf_in[i], buf_out + k);
        }

        if (fwrite(buf_out, 1, k, fp_out) != k) {
            fprintf(stderr, "Write file failed!\n");
            goto out;
        }
    }

    /* fread() returns 0 both at end of file and on error; tell them apart. */
    if (ferror(fp_in)) {
        fprintf(stderr, "Read file failed!\n");
        goto out;
    }

    rc = EXIT_SUCCESS;

out:
    fclose(fp_in);
    /* fclose() flushes buffered output, so its failure is a write failure. */
    if (fclose(fp_out) != 0 && rc == EXIT_SUCCESS) {
        fprintf(stderr, "Write file failed!\n");
        rc = EXIT_FAILURE;
    }
    return rc;
}
#endif /* WIN1251_NO_MAIN */
程式碼在linux下編譯執行OK,成功將1251.xml檔案轉換成utf8.xml檔案,現在用大部分編輯器都可以打開了。