1. 程式人生 > >c語言判斷是否是utf8字串,計算字元個數

c語言判斷是否是utf8字串,計算字元個數

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

/****************************************************************************

Unicode符號範圍 | UTF-8編碼方式

    (十六進位制) | (二進位制)

0000 0000-0000 007F:0xxxxxxx

0000 0080-0000 07FF:110xxxxx 10xxxxxx

0000 0800-0000 FFFF:1110xxxx 10xxxxxx 10xxxxxx

0001 0000-001F FFFF:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

0020 0000-03FF FFFF:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

0400 0000-7FFF FFFF:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

**************************************************************************/

 

/*
 * Lookup table indexed by a byte value (0x00-0xFF); each entry is the
 * sequence length associated with that byte:
 *
 *   0x00-0xBF -> 1      0xF0-0xF7 -> 4
 *   0xC0-0xDF -> 2      0xF8-0xFB -> 5
 *   0xE0-0xEF -> 3      0xFC-0xFD -> 6
 *                       0xFE-0xFF -> 1
 *
 * Note: bytes of the form 10xxxxxx (0x80-0xBF) and 0xFE/0xFF are given the
 * value 1 here; the table does not mark any byte as invalid.
 */
unsigned char utf8_look_for_table[] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

 

/*
 * Look up the sequence length for byte x in utf8_look_for_table.
 * The argument is converted to unsigned char first: plain char may be signed,
 * in which case a byte >= 0x80 would otherwise become a negative array index.
 */
#define UTFLEN(x) (utf8_look_for_table[(unsigned char)(x)])

 

/*
 * Given the first byte of a UTF-8 character, return how many bytes the whole
 * character occupies, using the 1-to-6-byte scheme described in the table at
 * the top of this file:
 *
 *   0xxxxxxx (0x00-0x7F) -> 1      11110xxx (0xF0-0xF7) -> 4
 *   110xxxxx (0xC0-0xDF) -> 2      111110xx (0xF8-0xFB) -> 5
 *   1110xxxx (0xE0-0xEF) -> 3      1111110x (0xFC-0xFD) -> 6
 *
 * Returns 0 when ch cannot start a character: 10xxxxxx bytes (0x80-0xBF),
 * which are only valid inside a multi-byte sequence, and 0xFE/0xFF, which
 * match none of the patterns above.
 *
 * Deliberately not declared `inline`: in C99 and later a plain `inline`
 * definition does not provide an external definition, so any call the
 * compiler chooses not to inline would fail at link time.
 */
int GetUtf8charByteNum(unsigned char ch)
{
    if (ch < 0x80) {
        return 1;
    }
    if (ch < 0xC0) {
        return 0; /* 10xxxxxx: not a lead byte */
    }
    if (ch < 0xE0) {
        return 2;
    }
    if (ch < 0xF0) {
        return 3;
    }
    if (ch < 0xF8) {
        return 4;
    }
    if (ch < 0xFC) {
        return 5;
    }
    if (ch < 0xFE) {
        return 6;
    }
    return 0; /* 0xFE and 0xFF never start a sequence */
}

 

/*
 * Check whether a NUL-terminated string is laid out as UTF-8.
 *
 * The string is scanned byte by byte. Each character's length is taken from
 * its first byte via GetUtf8charByteNum(); every following byte of that
 * character must have the form 10xxxxxx.
 *
 * Returns 1 if the whole string passes, 0 if str is NULL, a first byte is
 * rejected, a following byte has the wrong form, or the string ends in the
 * middle of a character.
 */
int IsUtf8Format(const char *str)
{
    const char *cursor;
    int pending = 0; /* bytes still expected for the current character */

    if (str == NULL) {
        return 0;
    }

    for (cursor = str; *cursor != '\0'; ++cursor) {
        unsigned char byte = (unsigned char)*cursor;

        if (pending > 0) {
            /* Inside a multi-byte character: must be 10xxxxxx. */
            if ((byte & 0xC0) != 0x80) {
                return 0;
            }
        } else {
            /* Start of a character: derive its length from this byte. */
            pending = GetUtf8charByteNum(byte);
            if (pending == 0) {
                return 0;
            }
        }
        --pending;
    }

    /* Leftover expected bytes mean the last character was cut short. */
    return (pending > 0) ? 0 : 1;
}

 

/*
 * Count the characters (not bytes) in a NUL-terminated UTF-8 string.
 *
 * The length of each character is taken from its first byte via
 * GetUtf8charByteNum(); the remaining bytes of the character are skipped
 * without being inspected.
 *
 * Returns the number of characters, or 0 if str is NULL, the string is empty,
 * a first byte is rejected by GetUtf8charByteNum(), or the final character
 * claims more bytes than remain in the string.
 */
int GetUtf8Length(char *str)
{
    size_t total;
    size_t offset = 0;
    int count = 0;

    if (str == NULL) {
        return 0;
    }

    total = strlen(str);
    while (offset < total) {
        int byteNum = GetUtf8charByteNum((unsigned char)str[offset]);

        if (byteNum == 0) {
            return 0;
        }
        /*
         * Never step past the terminator: a truncated trailing character
         * would otherwise move the read position outside the string.
         */
        if ((size_t)byteNum > total - offset) {
            return 0;
        }
        offset += (size_t)byteNum;
        count++;
    }
    return count;
}

 

/*
 * Compute the number of charged messages for a text of len characters
 * (main() reports the result as the "chargeNumber of sms").
 *
 *   len <= 0        -> 0
 *   1   .. 70       -> 1
 *   71  .. 500      -> len / 67, rounded up
 *   len > 500       -> 1
 *
 * NOTE(review): the len > 500 case is kept exactly as the original code
 * behaves; confirm whether texts above 500 characters should really be
 * charged as a single message.
 */
int GetChargeNum(int len)
{
    enum {
        CHARGE_SINGLE_LIMIT = 70,  /* longest text charged as one message */
        CHARGE_TOTAL_LIMIT = 500,  /* upper bound of the per-segment range */
        CHARGE_SEGMENT_SIZE = 67   /* characters per charged segment */
    };

    if (len <= 0) {
        return 0;
    }
    if (len > CHARGE_SINGLE_LIMIT && len <= CHARGE_TOTAL_LIMIT) {
        /* Ceiling division: an exact multiple must not be charged extra. */
        return (len + CHARGE_SEGMENT_SIZE - 1) / CHARGE_SEGMENT_SIZE;
    }
    return 1;
}

 

/*
 * Command-line driver. Takes the text to examine as the first argument,
 * echoes it, verifies it with IsUtf8Format(), then prints its character
 * count (GetUtf8Length) and its charge count (GetChargeNum).
 *
 * Exit status: EXIT_SUCCESS when both figures were printed; EXIT_FAILURE when
 * the argument is missing, the text is not UTF-8, or either count is 0.
 */
int main(int argc, char **argv)
{
    char *str;
    int len;
    int num;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <text>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "prog");
        return EXIT_FAILURE;
    }

    str = argv[1];
    printf("%s\n", str);

    if (!IsUtf8Format(str)) {
        printf("the text is not the Format of utf8\n");
        return EXIT_FAILURE;
    }

    len = GetUtf8Length(str);
    if (len == 0) {
        return EXIT_FAILURE;
    }
    printf("the length of text: %d\n", len);

    num = GetChargeNum(len);
    if (num == 0) {
        return EXIT_FAILURE;
    }
    printf("the chargeNumber of sms: %d\n", num);

    return EXIT_SUCCESS;
}