C# 關於utf-8的研究

阿新 • • 發佈：2018-11-28

前提

如果一不小心把字元轉成utf8的格式，但是卻產生了亂碼，這個時候要麼尋找其他的轉碼方式，要麼乾脆不要這些資料，直接過濾掉。

這裡說的是直接過濾的辦法。

參考連結

https://netvignettes.wordpress.com/2011/07/03/how-to-detect-encoding/

大概的程式碼解釋

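其實主要的思路就是對照 UTF-8 的位元組格式表（不過貌似它也不是嚴格對照的）：首位元組為 0xxxxxxx 時是單位元組字元，首位元組為 110xxxxx、1110xxxx、11110xxx 時分別代表 2、3、4 個位元組的字元，而後續位元組一律是 10xxxxxx。比如下面的程式碼就是根據首位元組判斷一個字元佔用的bytes數量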

/// <summary>
/// Tells whether <paramref name="b"/> has the bit pattern 11110xxx (0xF0-0xF7),
/// i.e. the lead byte of a 4-byte UTF-8 sequence.
/// </summary>
private static bool IsLead4(byte b) => (b & 0xF8) == 0xF0;
 
/// <summary>
/// Tells whether <paramref name="b"/> has the bit pattern 1110xxxx (0xE0-0xEF),
/// i.e. the lead byte of a 3-byte UTF-8 sequence.
/// </summary>
private static bool IsLead3(byte b) => (b & 0xF0) == 0xE0;
/// <summary>
/// Tells whether <paramref name="b"/> has the bit pattern 110xxxxx (0xC0-0xDF),
/// i.e. the lead byte of a 2-byte UTF-8 sequence.
/// </summary>
private static bool IsLead2(byte b) => (b & 0xE0) == 0xC0;
/// <summary>
/// Tells whether <paramref name="b"/> is a UTF-8 continuation byte, i.e. has the bit
/// pattern 10xxxxxx (0x80-0xBF inclusive).
/// </summary>
/// <remarks>
/// The lower bound is inclusive: 0x80 itself is a legal continuation byte
/// (for example U+00C0 is encoded as C3 80). The previous "b &gt; 0x80" test rejected it.
/// </remarks>
private static bool IsExtendedByte(byte b)
{
    return (b & 0xC0) == 0x80;
}

接下來就是注意一下特殊字元（例如檔案開頭的 BOM）的邊界情況

// Look for a byte order mark (BOM). Each BOM is tested independently of the others so
// that a buffer longer than three bytes is still checked for the UTF-8 and UTF-16 marks.
// The 4-byte marks come first because the UTF-32 little-endian BOM (FF FE 00 00)
// starts with the UTF-16 little-endian BOM (FF FE).
if (length >= 4)
{
    var one = bytes[offset];
    var two = bytes[offset + 1];
    var three = bytes[offset + 2];
    var four = bytes[offset + 3];
    // UTF-7 signature: "+/v" followed by one of '8', '9', '+', '/'.
    if (one == 0x2B &&
        two == 0x2F &&
        three == 0x76 &&
        (four == 0x38 || four == 0x39 || four == 0x2B || four == 0x2F))
    {
        return UTF7;
    }
    // UTF-32 little endian: FF FE 00 00.
    else if (one == 0xFF && two == 0xFE && three == 0x00 && four == 0x00)
    {
        return UTF32;
    }
    // UTF-32 big endian: 00 00 FE FF. .NET supports it through UTF32Encoding.
    else if (one == 0x00 && two == 0x00 && three == 0xFE && four == 0xFF)
    {
        return new UTF32Encoding(bigEndian: true, byteOrderMark: true);
    }
}
if (length >= 3)
{
    // UTF-8: EF BB BF.
    if (bytes[offset] == 0xEF && bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF)
    {
        return UTF8;
    }
}
if (length >= 2)
{
    // UTF-16 little endian: FF FE.
    if (bytes[offset] == 0xFF && bytes[offset + 1] == 0xFE)
    {
        return Unicode;
    }
    // UTF-16 big endian: FE FF.
    else if (bytes[offset] == 0xFE && bytes[offset + 1] == 0xFF)
    {
        return BigEndianUnicode;
    }
}
if (length > 1)
{
    // Look for a leading < sign (markup such as XML/HTML): the position of the zero
    // byte next to it reveals a BOM-less UTF-16 byte order.
    if (bytes[offset] == 0x3C)
    {
        if (bytes[offset + 1] == 0x00)
        {
            return Unicode;
        }
        else
        {
            return UTF8;
        }

    }
    else if (bytes[offset] == 0x00 && bytes[offset + 1] == 0x3C)
    {
        return BigEndianUnicode;
    }
}
// Validate only the range the caller asked about, not the whole buffer.
if (IsUtf8(bytes, offset, length))
{
    return UTF8;
}

接下來就是測試

/// <summary>
/// Demo entry point: runs the detector over a few sample buffers and prints the outcome
/// of each, so that the results are visible instead of being discarded.
/// </summary>
static void Main(string[] args)
{
    string ch = "金";
    string Ja = "らなくちゃ";
    string Re = "фыввфывфывфв";

    // Raw bytes from the original demo. The first byte (187 = 0xBB) is a bare
    // continuation byte, so this buffer is not well-formed UTF-8.
    byte[] Rom = { 187, 170, 200, 253, 194, 183, 211, 201, 198, 247, 95, 53, 71, 51, 54 };

    string[] labels = { "Chinese as UTF-8", "Japanese as UTF-8", "Russian as UTF-8", "Raw non-UTF-8 bytes" };
    byte[][] buffers = { UTF8.GetBytes(ch), UTF8.GetBytes(Ja), UTF8.GetBytes(Re), Rom };

    for (int i = 0; i < buffers.Length; i++)
    {
        // GetTextEncoding returns null when it cannot make a guess.
        Encoding guess = GetTextEncoding(buffers[i]);
        Console.WriteLine($"{labels[i]}: {guess?.WebName ?? "(unknown)"}");
    }
}

全部程式碼

using System;
using System.Text;
using static System.Text.Encoding;

namespace ConsoleApp1
{
    /// <summary>
    /// Console demo of heuristic text-encoding detection: byte order marks first, then a
    /// leading '&lt;' heuristic, then a structural UTF-8 check.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Determines whether the bytes in this buffer at the specified offset represent a UTF-8 character
        /// and, if so, how many bytes it occupies.
        /// </summary>
        /// <remarks>
        /// Only the binary pattern (lead byte followed by the right number of continuation bytes) is checked.
        /// Overlong forms, lead bytes 0xF5-0xF7 and encoded surrogates are not rejected, so a match does not
        /// guarantee a sensible character.
        /// </remarks>
        /// <param name="bytes">This buffer.</param>
        /// <param name="offset">The position in the buffer to check.</param>
        /// <param name="length">The number of bytes available to check (1-4), or 4 if not specified.</param>
        /// <returns>
        /// The number of bytes in the sequence starting at <paramref name="offset"/>, or
        /// <see cref="MultibyteRank.None"/> when the bytes do not form a complete UTF-8 sequence
        /// within <paramref name="length"/> bytes.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="offset"/> or <paramref name="length"/> is out of range, or the range extends past the buffer.
        /// </exception>
        public static MultibyteRank GetUtf8MultibyteRank(byte[] bytes, int offset = 0, int length = 4)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }
            if (offset < 0 || offset > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
            }
            // A length of 0 is rejected: the code below always reads bytes[offset], which
            // would be outside the requested range (and possibly outside the buffer).
            if (length < 1 || length > 4)
            {
                throw new ArgumentOutOfRangeException(nameof(length), "Only values 1-4 are valid.");
            }
            if (length > bytes.Length - offset)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
            }

            byte lead = bytes[offset];
            if (lead < 0x80)
            {
                // Plain ASCII.
                return MultibyteRank.One;
            }
            if (length >= 4 && IsLead4(lead))
            {
                bool complete = IsExtendedByte(bytes[offset + 1]) &&
                                IsExtendedByte(bytes[offset + 2]) &&
                                IsExtendedByte(bytes[offset + 3]);
                return complete ? MultibyteRank.Four : MultibyteRank.None;
            }
            if (length >= 3 && IsLead3(lead))
            {
                bool complete = IsExtendedByte(bytes[offset + 1]) && IsExtendedByte(bytes[offset + 2]);
                return complete ? MultibyteRank.Three : MultibyteRank.None;
            }
            if (length >= 2 && IsLead2(lead) && IsExtendedByte(bytes[offset + 1]))
            {
                return MultibyteRank.Two;
            }
            // Stray continuation byte, truncated sequence, or a byte that is never a lead.
            return MultibyteRank.None;
        }

        /// <summary>True for 11110xxx (0xF0-0xF7): lead byte of a 4-byte sequence.</summary>
        private static bool IsLead4(byte b)
        {
            return b >= 0xF0 && b < 0xF8;
        }

        /// <summary>True for 1110xxxx (0xE0-0xEF): lead byte of a 3-byte sequence.</summary>
        private static bool IsLead3(byte b)
        {
            return b >= 0xE0 && b < 0xF0;
        }

        /// <summary>True for 110xxxxx (0xC0-0xDF): lead byte of a 2-byte sequence.</summary>
        private static bool IsLead2(byte b)
        {
            return b >= 0xC0 && b < 0xE0;
        }

        /// <summary>
        /// True for 10xxxxxx (0x80-0xBF inclusive): a continuation byte. The lower bound is inclusive
        /// because 0x80 is a legal continuation byte (for example U+00C0 is encoded as C3 80).
        /// </summary>
        private static bool IsExtendedByte(byte b)
        {
            return b >= 0x80 && b < 0xC0;
        }

        /// <summary>
        /// Length in bytes of a UTF-8 sequence; <see cref="None"/> means "not a UTF-8 sequence".
        /// The numeric value is used directly as the number of bytes to advance.
        /// </summary>
        public enum MultibyteRank
        {
            None = 0,
            One = 1,
            Two = 2,
            Three = 3,
            Four = 4
        }

        /// <summary>
        /// Checks whether the given range of the buffer consists entirely of structurally valid UTF-8 sequences.
        /// </summary>
        /// <param name="bytes">The buffer to examine.</param>
        /// <param name="offset">The offset into the buffer to begin examination, or 0 if not specified.</param>
        /// <param name="length">The number of bytes to examine, or everything from <paramref name="offset"/> to the end if not specified.</param>
        /// <returns>
        /// <see langword="true"/> if every byte in the range belongs to a sequence accepted by
        /// <see cref="GetUtf8MultibyteRank"/> (an empty range yields <see langword="true"/>); otherwise <see langword="false"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The offset or length is out of range.</exception>
        public static bool IsUtf8(byte[] bytes, int offset = 0, int? length = null)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }
            if (offset < 0 || offset > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
            }
            int bytesRemaining = length ?? (bytes.Length - offset);
            if (bytesRemaining < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(length));
            }
            if (bytesRemaining > bytes.Length - offset)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
            }

            while (bytesRemaining > 0)
            {
                // Never look past the end of the requested range: a sequence cut off by the
                // range boundary is reported as None.
                var rank = GetUtf8MultibyteRank(bytes, offset, Math.Min(4, bytesRemaining));
                if (rank == MultibyteRank.None)
                {
                    return false;
                }
                int consumed = (int)rank;
                offset += consumed;
                bytesRemaining -= consumed;
            }
            return true;
        }

        /// <summary>
        /// Uses various discovery techniques to guess the encoding used for a byte buffer presumably containing text characters.
        /// </summary>
        /// <remarks>
        /// Note that this is only a guess and could be incorrect. Be prepared to catch exceptions while decoding with
        /// the encoding returned by this method (for example through <see cref="Encoding.GetDecoder"/>).
        /// </remarks>
        /// <param name="bytes">The buffer containing the bytes to examine.</param>
        /// <param name="offset">The offset into the buffer to begin examination, or 0 if not specified.</param>
        /// <param name="length">The number of bytes to examine, or everything from <paramref name="offset"/> to the end if not specified.</param>
        /// <returns>An encoding, or <see langword="null"/> if one cannot be determined.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The offset or length is out of range.</exception>
        public static Encoding GetTextEncoding(byte[] bytes, int offset = 0, int? length = null)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }
            if (offset < 0 || offset > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "Offset is out of range.");
            }
            // Default to "the rest of the buffer" so that a non-zero offset works without an explicit length.
            int count = length ?? (bytes.Length - offset);
            if (count < 0 || count > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(length), "Length is out of range.");
            }
            if (count > bytes.Length - offset)
            {
                throw new ArgumentOutOfRangeException(nameof(offset), "The specified range is outside of the specified buffer.");
            }

            // Look for a byte order mark. Each BOM is tested independently so that buffers longer than
            // three bytes are still checked for the UTF-8 and UTF-16 marks. The 4-byte marks come first
            // because the UTF-32 little-endian BOM (FF FE 00 00) starts with the UTF-16 one (FF FE).
            if (count >= 4)
            {
                var one = bytes[offset];
                var two = bytes[offset + 1];
                var three = bytes[offset + 2];
                var four = bytes[offset + 3];
                // UTF-7 signature: "+/v" followed by one of '8', '9', '+', '/'.
                // NOTE(review): Encoding.UTF7 is flagged obsolete on .NET 5+; kept to preserve the result.
                if (one == 0x2B &&
                    two == 0x2F &&
                    three == 0x76 &&
                    (four == 0x38 || four == 0x39 || four == 0x2B || four == 0x2F))
                {
                    return UTF7;
                }
                // UTF-32 little endian: FF FE 00 00.
                if (one == 0xFF && two == 0xFE && three == 0x00 && four == 0x00)
                {
                    return UTF32;
                }
                // UTF-32 big endian: 00 00 FE FF. .NET supports it through UTF32Encoding.
                if (one == 0x00 && two == 0x00 && three == 0xFE && four == 0xFF)
                {
                    return new UTF32Encoding(bigEndian: true, byteOrderMark: true);
                }
            }
            if (count >= 3 &&
                bytes[offset] == 0xEF && bytes[offset + 1] == 0xBB && bytes[offset + 2] == 0xBF)
            {
                return UTF8;
            }
            if (count >= 2)
            {
                // UTF-16 little endian: FF FE.
                if (bytes[offset] == 0xFF && bytes[offset + 1] == 0xFE)
                {
                    return Unicode;
                }
                // UTF-16 big endian: FE FF.
                if (bytes[offset] == 0xFE && bytes[offset + 1] == 0xFF)
                {
                    return BigEndianUnicode;
                }

                // Look for a leading < sign (markup such as XML/HTML): the position of the zero byte
                // next to it reveals a BOM-less UTF-16 byte order.
                if (bytes[offset] == 0x3C)
                {
                    return bytes[offset + 1] == 0x00 ? Unicode : UTF8;
                }
                if (bytes[offset] == 0x00 && bytes[offset + 1] == 0x3C)
                {
                    return BigEndianUnicode;
                }
            }

            // Validate only the requested range, not the whole buffer.
            if (IsUtf8(bytes, offset, count))
            {
                return UTF8;
            }

            // Impossible to tell.
            return null;
        }

        /// <summary>
        /// Demo entry point: runs the detector over a few sample buffers and prints the outcome
        /// of each, so that the results are visible instead of being discarded.
        /// </summary>
        static void Main(string[] args)
        {
            string ch = "金";
            string Ja = "らなくちゃ";
            string Re = "фыввфывфывфв";

            // Raw bytes from the original demo. The first byte (187 = 0xBB) is a bare
            // continuation byte, so this buffer is not well-formed UTF-8.
            byte[] Rom = { 187, 170, 200, 253, 194, 183, 211, 201, 198, 247, 95, 53, 71, 51, 54 };

            string[] labels = { "Chinese as UTF-8", "Japanese as UTF-8", "Russian as UTF-8", "Raw non-UTF-8 bytes" };
            byte[][] buffers = { UTF8.GetBytes(ch), UTF8.GetBytes(Ja), UTF8.GetBytes(Re), Rom };

            for (int i = 0; i < buffers.Length; i++)
            {
                // GetTextEncoding returns null when it cannot make a guess.
                Encoding guess = GetTextEncoding(buffers[i]);
                Console.WriteLine($"{labels[i]}: {guess?.WebName ?? "(unknown)"}");
            }
        }
    }
}

c# UTF-8解碼編碼及陣列與List<string>之間轉換等基本知識點總結

Encoding utf8 = Encoding.UTF8; //首先用utf-8進行解碼 &

C++ UTF-8與 Unicode互相轉換

C++ UTF-8 轉 Unicode char* UTF8ToUnicode(char* szUTF8) { int wcscLen = ::MultiByteToWideChar(CP_UTF8, NULL, szUTF8, strlen(szUTF8),

C# UTF-8與GB2312編碼的相互轉化

1、首先引入名稱空間：using System.Text; 2、 GB2312轉化為UTF-8： string LanChange(string str) { Encoding utf8; Encoding gb2312;

C# UTF-8 去BOM頭

在C#中，當使用帶有BOM頭的UTF-8編碼的字串時，一定要注意。 1）如果該字串用作路徑，用來定址。一定會出錯。2）轉換格式時，也很容易出錯。例如字串轉int就一定會出錯。待續… ——————————————————————————————————————

C# 關於utf-8的研究

前提如果一不小心把字元轉成utf8的格式，但是卻產生了亂碼。這個時候要麼就是尋找其他的轉碼方式，要麼就不想要了，直接過濾吧。這裡說的是直接過濾的辦法。參考連結 https://netvignettes.wordpress.com/2011/07/03/how-to-detect-encoding

c#之如何轉換文本文件編碼格式為utf-8

格式 logs 文件編碼 pre str enc cnblogs style ext 如代碼： string content = File.ReadAllText(path, Encoding.Default); File.WriteAllText(path, conte

C# MD5 32位加密 UTF-8編碼

spl 十六進制 post ring one 類型開始出現問題 int 項目開發過程中需要用到MD5加密，最開始的使用使用加密方法： public static string GetMD5(string str) { byte[] b = System

C++ 讀寫utf-8檔案

轉載自：https://blog.csdn.net/sdscscs22/article/details/53895416 UTF-8 UTF-8（8-bit Unicode Transformation Format）是一種針對Unicode的可變長度字元編碼，又稱萬國碼。由Ken Thom

linux下c語言利用iconv函式實現utf-8轉unicode

由於專案中需要轉換原生unicode到ascii的功能，本來想的用的是linux或者windows自帶的寬位元組轉成窄位元組的函式，但由於本身使用了apr_iconv庫，所以直接使用庫函式來解決。期間碰到了庫函式使用一直出錯的問題，一

利用純c++和windows api實現gb2312和utf-8兩種編碼格式的轉換

為什麼同樣的字串在兩臺主機上，會出現一個顯示正常，一個顯示亂碼的情況呢？答案：編碼方式不匹配。解釋：任何內容在計算機中的儲存形式都是二進位制，不論是在記憶體中還是在硬碟中。所以，同一個字串在兩臺主機上的二進位制儲存是一模一樣的。只是將這個二進位制資料呈現時，發生了變化。呈現字串的過程就是

嚴格的C風格字串 Unicode To UTF-8 的實現（C#、JavaScript）

本文是關於 Unicode 也就是 LPWSTR 轉換成 UTF-8 的實現，在 Win32k 平臺中我們可以藉助 “MultiByteToWideChar / WideCharToMultiByte”【核心程式設計】兩個函式進行多位元組與寬位元組字串進行轉換【PS：A

c++中gbk和utf-8互轉

gbk轉utf-8 char* G2U(const char* gb2312) { ASSERT(gb2312!=NULL); int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0);

C++中UTF-8轉換成string

本來想著有沒有直接將UTF-8編碼轉換成string的函式，查找了半天，然而並沒有，無奈只好自己封裝了一個，使用了C++11，程式碼如下： std::string MyString::UnicodeT

C++控制檯輸出UTF-8亂碼

找到了一個函式，列印時候用這個函式包裝一下，把UTF-8格式轉成GB2312格式就可以輸出到控制檯了。 static char* U2G(const char* utf8) { int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL,

C++ 實現unicode到utf-8的轉碼

思路：獲取字串裡面中的Unicode部分，然後將該部分轉換位utf-8格式的字元，最後將字串裡面的所有Unicode替換為utf-8即可。廢話不多少，直接上程式碼：標頭檔案： /* * charsetEncode.h * * Created on: Jul

C++ ANSI 與 utf-8轉換

[cpp] view plain copy print?//UTF8轉ANSIvoid UTF8toANSI(CString &strUTF8) { //獲取轉換為多位元組後需要的緩衝區大小，建立多位元組緩衝區 UINT nLen

C 和 C++ 的標準庫分別有自己的 locale 操作方法，C 標準庫的 locale 設定函式是 setlocale()，而 C++ 標準庫有 locale 類和流物件的 imbue() 方法（gcc使用zh_CN.GBK，或者zh_CN.UTF-8，VC++使用Chinese_People&#

轉自：http://zyxhome.org/wp/cc-prog-lang/c-stdlib-setlocale-usage-note/ [在此向原文作者說聲謝謝！若有讀者看到文章轉載時請寫該轉載地址，不要寫我的BLOG地址。尊重他人的勞動成果 ^_^ ] C 和 C++ 的標準庫分別有自己的

C# 關於utf-8的研究

前提

參考連結

大概的程式碼解釋

全部程式碼

c# UTF-8解碼編碼及陣列與List<string>之間轉換等基本知識點總結

C++ UTF-8與 Unicode互相轉換

C# UTF-8與GB2312編碼的相互轉化

C# UTF-8 去BOM頭

C# 關於utf-8的研究

c#之如何轉換文本文件編碼格式為utf-8

C# MD5 32位加密 UTF-8編碼

C++ 讀寫utf-8檔案

linux下c語言利用iconv函式實現utf-8轉unicode

利用純c++和windows api實現gb2312和utf-8兩種編碼格式的轉換

嚴格的C風格字串 Unicode To UTF-8 的實現（C#、JavaScript）

c++中gbk和utf-8互轉

C++中UTF-8轉換成string

C++控制檯輸出UTF-8亂碼

C++ 實現unicode到utf-8的轉碼

C++ ANSI 與 utf-8轉換

C 和 C++ 的標準庫分別有自己的 locale 操作方法，C 標準庫的 locale 設定函式是 setlocale()，而 C++ 標準庫有 locale 類和流物件的 imbue() 方法（gcc使用zh_CN.GBK，或者zh_CN.UTF-8，VC++使用Chinese_People&#

[轉載] c++中UTF-8到ANSI的轉換

C++中GB2312字串和UTF-8之間的轉換-json中文亂碼問題

MFC/C++ CFile寫入檔案資料，輸出utf-8的文字。(防止中文亂碼)

C# 關於utf-8的研究

前提

參考連結

大概的程式碼解釋

全部程式碼

相關推薦