1. 程式人生 > 字串UTF-8和GBK之間的轉換以及判定

字串UTF-8和GBK之間的轉換以及判定

一、判定字串是否是UTF-8的編碼

/**
 * Checks whether a NUL-terminated byte string is structurally valid UTF-8.
 *
 * Only the lead-byte / continuation-byte layout is examined; overlong forms
 * and code-point ranges are not checked. The legacy 5- and 6-byte forms
 * (lead bytes 0xF8-0xFD) are still accepted, as in the original code.
 * Pure ASCII input, including the empty string, counts as UTF-8.
 *
 * @param str NUL-terminated input; a null pointer yields false.
 * @return true if every multi-byte sequence is complete and well-formed.
 */
bool is_str_utf8(const char* str)
{
    if (!str)
    {
        return false;
    }

    // Number of continuation bytes (10xxxxxx) still owed by the current sequence.
    unsigned int pending = 0;

    for (unsigned int i = 0; str[i] != '\0'; ++i)
    {
        const unsigned char chr = static_cast<unsigned char>(str[i]);

        if (pending != 0)
        {
            // Every non-lead byte of a multi-byte sequence must be 10xxxxxx.
            if ((chr & 0xC0) != 0x80)
            {
                return false;
            }
            --pending;
            continue;
        }

        if (chr < 0x80)
        {
            // 0xxxxxxx: single-byte ASCII.
            continue;
        }

        // Lead byte: derive the sequence length from its high bits.
        if (chr >= 0xFE)
        {
            // 0xFE and 0xFF never appear in any UTF-8 form.
            return false;
        }
        else if (chr >= 0xFC)
        {
            pending = 5; // 6-byte sequence
        }
        else if (chr >= 0xF8)
        {
            pending = 4; // 5-byte sequence
        }
        else if (chr >= 0xF0)
        {
            pending = 3; // 4-byte sequence
        }
        else if (chr >= 0xE0)
        {
            pending = 2; // 3-byte sequence
        }
        else if (chr >= 0xC0)
        {
            pending = 1; // 2-byte sequence
        }
        else
        {
            // 0x80-0xBF is a continuation byte with no lead byte before it.
            return false;
        }
    }

    // A sequence cut short by the terminator violates the encoding rules.
    return pending == 0;
}

 

二、判定字串是否是GBK的編碼

/**
 * Checks whether a NUL-terminated byte string is structurally valid GBK.
 *
 * GBK uses one byte for ASCII and two bytes for everything else: a lead byte
 * in 0x81-0xFE followed by a trail byte in 0x40-0x7E or 0x80-0xFE.
 * This is a byte-range check only, so text in another encoding whose bytes
 * happen to fall inside these ranges is also accepted.
 * Pure ASCII input, including the empty string, counts as GBK.
 *
 * @param str NUL-terminated input; a null pointer yields false.
 * @return true if every double-byte character is complete and in range.
 */
bool is_str_gbk(const char* str)
{
    if (!str)
    {
        return false;
    }

    // True while the previous byte was a lead byte still waiting for its trail byte.
    bool expectTrail = false;

    for (unsigned int i = 0; str[i] != '\0'; ++i)
    {
        const unsigned char chr = static_cast<unsigned char>(str[i]);

        if (expectTrail)
        {
            // 0x7F sits inside 0x40-0xFE but is not a valid GBK trail byte.
            if (chr < 0x40 || chr == 0x7F || chr > 0xFE)
            {
                return false;
            }
            expectTrail = false;
        }
        else if (chr >= 0x80)
        {
            // Non-ASCII byte: must be a lead byte; 0x80 and 0xFF are not.
            if (chr < 0x81 || chr > 0xFE)
            {
                return false;
            }
            expectTrail = true;
        }
    }

    // A lead byte at the very end of the string has no trail byte.
    return !expectTrail;
}

 

三、字串由GBK編碼轉換成UTF-8編碼

/**
 * Converts the contents of strGBK in place from the ANSI code page to UTF-8.
 *
 * The text is first widened to UTF-16 with MultiByteToWideChar(CP_ACP) and
 * then narrowed with WideCharToMultiByte(CP_UTF8). If any conversion step
 * fails, strGBK is left untouched.
 *
 * NOTE(review): CP_ACP is the system's active ANSI code page, so the input is
 * only interpreted as GBK when that code page is 936 — confirm for the target
 * machines. The LPCTSTR argument is passed where the API expects a narrow
 * string, which presumably requires a non-UNICODE (MBCS) build — confirm.
 *
 * @param strGBK in: ANSI/GBK text; out: the same text encoded as UTF-8.
 */
void ConvertGBKToUtf8(CString &strGBK)
{
    LPCTSTR src = (LPCTSTR)strGBK;

    // First call only measures: the result counts wide chars including the terminator.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return;
    }

    wchar_t* wide = new wchar_t[wideLen];
    // memset counts bytes, so scale by the element size.
    memset(wide, 0, wideLen * sizeof(wchar_t));

    if (MultiByteToWideChar(CP_ACP, 0, src, -1, wide, wideLen) > 0)
    {
        const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
        if (utf8Len > 0)
        {
            char* utf8 = new char[utf8Len + 1];
            memset(utf8, 0, utf8Len + 1);
            if (WideCharToMultiByte(CP_UTF8, 0, wide, -1, utf8, utf8Len, NULL, NULL) > 0)
            {
                strGBK = utf8;
            }
            delete[] utf8;
        }
    }

    delete[] wide;
}


/**
 * Returns a UTF-8 copy of an ANSI-code-page (GBK) C string.
 *
 * The text is widened to UTF-16 with MultiByteToWideChar(CP_ACP) and then
 * narrowed with WideCharToMultiByte(CP_UTF8). Buffers are held in
 * std::wstring / std::string so nothing is leaked on any exit path.
 *
 * NOTE(review): CP_ACP is the system's active ANSI code page, so the input is
 * only interpreted as GBK when that code page is 936 — confirm for the target
 * machines.
 *
 * @param strGBK NUL-terminated ANSI/GBK text; may be null.
 * @return the UTF-8 text, or an empty string if strGBK is null or a
 *         conversion step fails.
 */
string GBKToUTF8(const char* strGBK)
{
    if (strGBK == NULL)
    {
        return string();
    }

    // Measuring call: the result counts wide chars including the terminator.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, strGBK, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return string();
    }

    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, strGBK, -1, &wide[0], wideLen) <= 0)
    {
        return string();
    }

    // Measuring call: the result counts bytes including the terminator.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
    {
        return string();
    }

    string utf8(static_cast<size_t>(utf8Len), '\0');
    if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
    {
        return string();
    }

    // Drop the terminator that the API wrote into the last slot.
    utf8.resize(static_cast<size_t>(utf8Len) - 1);
    return utf8;
}

 

四、字串由UTF-8編碼轉換成GBK編碼

/**
 * Returns an ANSI-code-page (GBK) copy of a UTF-8 C string.
 *
 * The text is widened to UTF-16 with MultiByteToWideChar(CP_UTF8) and then
 * narrowed with WideCharToMultiByte(CP_ACP). Buffers are held in
 * std::wstring / std::string, so the narrow buffer is no longer leaked.
 *
 * NOTE(review): CP_ACP is the system's active ANSI code page, so the output is
 * only GBK when that code page is 936 — confirm for the target machines.
 *
 * @param utf8 NUL-terminated UTF-8 text; may be null.
 * @return the converted text, or an empty string if utf8 is null or a
 *         conversion step fails.
 */
string UtfToGbk(const char* utf8)
{
    if (utf8 == NULL)
    {
        return string();
    }

    // Measuring call: the result counts wide chars including the terminator.
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return string();
    }

    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &wide[0], wideLen) <= 0)
    {
        return string();
    }

    // Measuring call: the result counts bytes including the terminator.
    const int gbkLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (gbkLen <= 0)
    {
        return string();
    }

    string gbk(static_cast<size_t>(gbkLen), '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &gbk[0], gbkLen, NULL, NULL) <= 0)
    {
        return string();
    }

    // Drop the terminator that the API wrote into the last slot.
    gbk.resize(static_cast<size_t>(gbkLen) - 1);
    return gbk;
}

/**
 * Converts UTF-8 text to GBK using the C locale conversion functions.
 *
 * The source is decoded to wide characters under the "zh_CN.utf8" locale and
 * re-encoded under the "zh_CN.gbk" locale. Both locales must be installed.
 *
 * Side effect: the process-wide locale (LC_ALL) is changed and not restored;
 * after a successful call it is "zh_CN.gbk".
 *
 * @param gbkStr receives the GBK text on success; unchanged on failure.
 * @param srcStr UTF-8 input. An empty input is reported as a failure, as in
 *               the original code.
 * @return true on success; false if a locale is unavailable or the text
 *         cannot be converted (a message is printed to stdout).
 */
bool Utf82gbk(std::string &gbkStr, std::string &srcStr)
{
    // mbstowcs / wcstombs signal an unconvertible sequence with (size_t)-1.
    const size_t kConvFailed = static_cast<size_t>(-1);

    // Step 1: decode UTF-8 into wide characters under a UTF-8 locale.
    if (NULL == setlocale(LC_ALL, "zh_CN.utf8"))
    {
        printf("Bad Parameter\n");
        return false;
    }

    // A null destination asks only for the converted length (terminator excluded).
    const size_t wideLen = mbstowcs(NULL, srcStr.c_str(), 0);
    if (wideLen == kConvFailed || wideLen == 0)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }

    // Owned by std::wstring so the early returns below cannot leak it.
    std::wstring wide(wideLen + 1, L'\0');
    mbstowcs(&wide[0], srcStr.c_str(), wideLen + 1);

    // Step 2: encode the wide characters as GBK under a GBK locale.
    if (NULL == setlocale(LC_ALL, "zh_CN.gbk"))
    {
        printf("Bad Parameter\n");
        return false;
    }

    const size_t gbkLen = wcstombs(NULL, wide.c_str(), 0);
    if (gbkLen == kConvFailed || gbkLen == 0)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }

    // Sized from the measured length rather than a fixed stack array, so long
    // inputs cannot overrun the buffer.
    std::string gbk(gbkLen + 1, '\0');
    wcstombs(&gbk[0], wide.c_str(), gbkLen + 1);
    gbk.resize(gbkLen); // drop the terminator slot

    gbkStr.swap(gbk);
    return true;
}


/**
 * Returns an ANSI-code-page (GBK) copy of a UTF-8 std::string.
 *
 * The text is widened to UTF-16 with MultiByteToWideChar(CP_UTF8) and then
 * narrowed with WideCharToMultiByte(CP_ACP). Buffers are held in
 * std::wstring / std::string so nothing is leaked if an allocation throws.
 * Input is read through c_str(), so conversion stops at the first NUL byte.
 *
 * NOTE(review): CP_ACP is the system's active ANSI code page, so the output is
 * only GBK when that code page is 936 — confirm for the target machines.
 *
 * @param strUTF8 UTF-8 encoded text.
 * @return the converted text, or an empty string if a conversion step fails.
 */
string UTF8ToGBK(const std::string& strUTF8)
{
    const char* const utf8 = strUTF8.c_str();

    // Measuring call: the result counts wide chars including the terminator.
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return string();
    }

    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &wide[0], wideLen) <= 0)
    {
        return string();
    }

    // Measuring call: the result counts bytes including the terminator.
    const int gbkLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (gbkLen <= 0)
    {
        return string();
    }

    std::string gbk(static_cast<size_t>(gbkLen), '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &gbk[0], gbkLen, NULL, NULL) <= 0)
    {
        return string();
    }

    // Drop the terminator that the API wrote into the last slot.
    gbk.resize(static_cast<size_t>(gbkLen) - 1);
    return gbk;
}