1. 程式人生 > >unicode字元和多位元組字元的相互轉換介面

unicode字元和多位元組字元的相互轉換介面

作者:朱金燦

           開源程式碼中可以利用的資源真多。我從 sqlite3 的原始碼中摳出了幾個字元轉換介面,稍微改造了一下,發現還挺好用的。下面是實現程式碼:

/*
** Convert a NUL-terminated UTF-8 string into a Windows wide-character
** (WCHAR) string.
**
** The result is allocated with malloc(); the caller owns it and must
** release it with free().  Returns 0 if the allocation fails or if the
** conversion call reports 0.
*/
static WCHAR *utf8ToUnicode(const char *zFilename){
	/* First pass: no output buffer, only ask how many WCHARs are needed. */
	const int nNeeded = MultiByteToWideChar(CP_UTF8, 0, zFilename, -1, NULL, 0);

	WCHAR *zWide = static_cast<WCHAR *>(malloc(nNeeded*sizeof(WCHAR)));
	if( !zWide ){
		return 0;
	}

	/* Second pass: perform the conversion into the freshly allocated buffer. */
	if( MultiByteToWideChar(CP_UTF8, 0, zFilename, -1, zWide, nNeeded)!=0 ){
		return zWide;
	}

	/* Conversion reported failure: do not hand back a garbage buffer. */
	free(zWide);
	return 0;
}

/*
** Convert a NUL-terminated Windows wide-character (WCHAR) string into
** UTF-8.
**
** The result is allocated with malloc(); the caller owns it and must
** release it with free().  Returns 0 if the allocation fails or if the
** conversion call reports 0.
*/
static char *unicodeToUtf8(const WCHAR *zWideFilename){
	/* First pass: no output buffer, only ask how many bytes are needed. */
	const int nNeeded = WideCharToMultiByte(CP_UTF8, 0, zWideFilename, -1, 0, 0, 0, 0);

	char *zUtf8 = static_cast<char*>(malloc( nNeeded ));
	if( !zUtf8 ){
		return 0;
	}

	/* Second pass: perform the conversion into the freshly allocated buffer. */
	const int nWritten = WideCharToMultiByte(CP_UTF8, 0, zWideFilename, -1,
		zUtf8, nNeeded, 0, 0);
	if( nWritten!=0 ){
		return zUtf8;
	}

	/* Conversion reported failure: release the buffer and signal the error. */
	free(zUtf8);
	return 0;
}

/*
** Convert a NUL-terminated multibyte (ANSI or OEM) string into a Windows
** wide-character (WCHAR) string.  The source code page follows the
** current file-API setting: CP_ACP when AreFileApisANSI() is true,
** CP_OEMCP otherwise.
**
** The result is allocated with malloc(); the caller owns it and must
** release it with free().  Returns 0 if the size query fails, if the
** allocation fails, or if the conversion call reports 0.
*/
static WCHAR *mbcsToUnicode(const char *zFilename){

	int nChar;
	WCHAR *zMbcsFilename;
	int codepage = AreFileApisANSI() ? CP_ACP : CP_OEMCP;

	/* Size query: the return value is a count of WCHARs, not of bytes. */
	nChar = MultiByteToWideChar(codepage, 0, zFilename, -1, NULL, 0);
	if( nChar<=0 ){

		return 0;

	}

	/* Allocate exactly nChar wide characters (scale to bytes only here). */
	zMbcsFilename = static_cast<WCHAR*>(malloc( nChar*sizeof(zMbcsFilename[0]) ));
	if( zMbcsFilename==0 ){

		return 0;

	}

	/* The capacity argument is expressed in WCHARs and matches the buffer. */
	nChar = MultiByteToWideChar(codepage, 0, zFilename, -1, zMbcsFilename, nChar);
	if( nChar==0 )
	{
		free(zMbcsFilename);
		zMbcsFilename = 0;
	}
	return zMbcsFilename;

}

/*
** Convert a NUL-terminated Windows wide-character (WCHAR) string into a
** multibyte string.  The target code page follows the current file-API
** setting: CP_ACP when AreFileApisANSI() is true, CP_OEMCP otherwise.
**
** The result is allocated with malloc(); the caller owns it and must
** release it with free().  Returns 0 if the allocation fails or if the
** conversion call reports 0.
*/
static char* unicodeToMbcs(const WCHAR* zWideFilename){
	const int nCodepage = AreFileApisANSI() ? CP_ACP : CP_OEMCP;

	/* First pass: no output buffer, only ask how many bytes are needed. */
	const int nNeeded = WideCharToMultiByte(nCodepage, 0, zWideFilename, -1, 0, 0, 0, 0);

	char *zMbcs = static_cast<char*>(malloc( nNeeded ));
	if( !zMbcs ){
		return 0;
	}

	/* Second pass: perform the conversion into the freshly allocated buffer. */
	const int nWritten = WideCharToMultiByte(nCodepage, 0, zWideFilename, -1,
		zMbcs, nNeeded, 0, 0);
	if( nWritten!=0 ){
		return zMbcs;
	}

	/* Conversion reported failure: release the buffer and signal the error. */
	free(zMbcs);
	return 0;
}

/*
** Convert multibyte character string to UTF-8.  Space to hold the
** returned string is obtained from malloc().
*/
static char* mbcsToUtf8(const char *zFilename){

	char *zFilenameUtf8;
	WCHAR *zTmpWide;

	zTmpWide = mbcsToUnicode(zFilename);
	if( zTmpWide==0 ){

		return 0;

	}
	zFilenameUtf8 = unicodeToUtf8(zTmpWide);
	free(zTmpWide);
	return zFilenameUtf8;
}

/*
** Convert UTF-8 to multibyte character string.  Space to hold the 
** returned string is obtained from malloc().
*/
static char* utf8ToMbcs(const char *zFilename){

	char *zFilenameMbcs;
	WCHAR* zTmpWide;

	zTmpWide = utf8ToUnicode(zFilename);
	if( zTmpWide==0 ){

		return 0;

	}
	zFilenameMbcs = unicodeToMbcs(zTmpWide);
	free(zTmpWide);
	return zFilenameMbcs;
}

/*
** Convert a NUL-terminated multibyte (ANSI or OEM) string to UTF-8 and
** return it as a std::string.  The source code page follows the current
** file-API setting: CP_ACP when AreFileApisANSI() is true, CP_OEMCP
** otherwise.
**
** Returns an empty string if any conversion step reports failure.
**
** The temporary buffers are std::wstring / std::string objects, so they
** are released automatically on every return path (no new[]/delete
** pairing to get wrong, no leak on the error paths).
*/
std::string MbcsToUtf8( const char* pszMbcs )
{
	std::string str;
	const int codepage = AreFileApisANSI() ? CP_ACP : CP_OEMCP;

	/* Step 1: multibyte -> wide.  Query the size (in WCHARs) first. */
	const int wideLen = MultiByteToWideChar(codepage, 0, pszMbcs, -1, NULL, 0);
	if( wideLen<=0 )
	{
		return str;
	}
	std::wstring wide(static_cast<std::wstring::size_type>(wideLen), L'\0');
	if( MultiByteToWideChar(codepage, 0, pszMbcs, -1, &wide[0], wideLen)==0 )
	{
		return str;
	}

	/* Step 2: wide -> UTF-8.  Query the size (in bytes) first. */
	const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, &wide[0], -1, NULL, 0, NULL, NULL);
	if( utf8Len<=0 )
	{
		return str;
	}
	std::string utf8(static_cast<std::string::size_type>(utf8Len), '\0');
	if( WideCharToMultiByte(CP_UTF8, 0, &wide[0], -1, &utf8[0], utf8Len, NULL, NULL)==0 )
	{
		return str;
	}

	/* Copy up to (not including) the first NUL, as the C-string
	** assignment in the original implementation did. */
	str.assign(utf8.c_str());
	return str;
}
    

       為了測試這些介面,我寫了一個測試工程:它讀取一個 XML 檔案,並對其中的字元進行轉換。測試工程的程式碼下載地址如下: