C++中ANSI、Unicode16、UTF-8字符串之间的互转

更多文章点击这里

敲黑板

在计算机内部，所有的信息最终都表示为一个二进制的字符串。每一个二进制位（bit）有0和1两种状态，因此八个二进制位就可以组合出256种状态，这被称为一个字节（byte）。也就是说，一个字节一共可以用来表示256种不同的状态，每一个状态对应一个符号，就是256个符号，从00000000到11111111。

CHAR

一种数据类型，代表一个字节，在内存中有8位。

ANSI

美国制定了一套字符编码，对英语字符与二进制位之间的关系，做了统一规定。这被称为ASCII码。

ANSI指的是Windows系统默认的本地编码方式（即当前代码页）。对于英文环境是ASCII编码，对于简体中文环境是GB2312（GBK）编码。

注意英文占 1 个字节，汉字 2 个字节，以一个\0结尾。

Unicode

这是一种囊括了所有符号的编码，可以容纳100多万个符号。

注意：以Unicode16（即UTF-16）为例，常用字符(汉字、英文字母)每个都占 2 个字节（少数生僻字符和表情符号需要用代理对表示，占 4 个字节），字符串以 2 个连续的\0结尾。

UTF-8

UTF-8是互联网上使用最广的一种Unicode的实现方式。

注意英文占 1 个字节，汉字占 3 个字节。

转换

Unicode16转ANSI

ANSI转Unicode16

Unicode16转UTF-8

UTF-8转Unicode16

Unicode16转ANSI:


/**
 * Converts a NUL-terminated wide (UTF-16) string to a multi-byte string in
 * the process's ANSI code page (CP_ACP).
 *
 * @param str  NUL-terminated wide string; may be NULL.
 * @return     The converted string without a trailing NUL. An empty string is
 *             returned when `str` is NULL or when the conversion fails.
 */
string UnicodeToANSI(const wchar_t* str)
{
    if (str == NULL)
    {
        return string();
    }

    // First pass: ask how many bytes are required. Because the input length
    // is given as -1, the reported size includes the terminating NUL.
    const int bytesNeeded = ::WideCharToMultiByte(CP_ACP,
        0,
        str,
        -1,
        NULL,
        0,
        NULL,
        NULL);
    if (bytesNeeded <= 0)
    {
        return string();
    }

    // The string object owns the buffer, so nothing leaks if an exception
    // is thrown further down.
    string converted(static_cast<size_t>(bytesNeeded), '\0');

    // Second pass: perform the actual conversion into the buffer.
    const int bytesWritten = ::WideCharToMultiByte(CP_ACP,
        0,
        str,
        -1,
        &converted[0],
        bytesNeeded,
        NULL,
        NULL);
    if (bytesWritten <= 0)
    {
        return string();
    }

    // Drop the terminating NUL that the API wrote into the buffer.
    converted.resize(static_cast<size_t>(bytesWritten) - 1);
    return converted;
}

ANSI转Unicode16:


/**
 * Converts a multi-byte string in the process's ANSI code page (CP_ACP) to a
 * wide (UTF-16) string.
 *
 * The input is handed to the API as a NUL-terminated C string, so conversion
 * stops at the first NUL byte contained in `str`.
 *
 * @param str  Source string in the ANSI code page.
 * @return     The converted wide string without a trailing NUL. An empty
 *             string is returned when the conversion fails.
 */
wstring ANSIToUnicode(const string& str)
{
    // First pass: ask how many wide characters are required. Because the
    // input length is given as -1, the count includes the terminating NUL.
    const int unitsNeeded = ::MultiByteToWideChar(CP_ACP,
        0,
        str.c_str(),
        -1,
        NULL,
        0);
    if (unitsNeeded <= 0)
    {
        return wstring();
    }

    // The wstring owns the buffer; this replaces the former new[] / delete
    // pair, which was mismatched (scalar delete on an array allocation).
    wstring converted(static_cast<size_t>(unitsNeeded), L'\0');

    // Second pass: perform the actual conversion into the buffer.
    const int unitsWritten = ::MultiByteToWideChar(CP_ACP,
        0,
        str.c_str(),
        -1,
        &converted[0],
        unitsNeeded);
    if (unitsWritten <= 0)
    {
        return wstring();
    }

    // Drop the terminating NUL that the API wrote into the buffer.
    converted.resize(static_cast<size_t>(unitsWritten) - 1);
    return converted;
}

Unicode16转UTF-8 :


// Fixed-width unsigned integer aliases used by the UTF-16 <-> UTF-8
// conversion routines in this file.
// NOTE(review): `__int32` is a compiler-specific keyword (it looks like this
// was written for MSVC) -- confirm the target toolchain, or switch to
// <cstdint> if it is available.
typedef unsigned __int32 uint32_t;
typedef unsigned short  uint16_t;

/**
 * Converts a sequence of UTF-16 code units to a NUL-terminated UTF-8 string.
 *
 * Surrogate pairs are combined and written as a single 4-byte sequence; an
 * unpaired surrogate is replaced by U+FFFD. NUL code units inside the input
 * are converted like any other character (the input length is explicit).
 *
 * @param pszUtf16    Source UTF-16 code units (need not be NUL-terminated).
 * @param nSizeUtf16  Number of code units to read from pszUtf16.
 * @param pszUtf8     Destination buffer; ignored when nSizeUtf8 is 0.
 * @param nSizeUtf8   Capacity of pszUtf8 in bytes, terminator included.
 *                    Pass 0 to query a buffer size that is always sufficient.
 * @return  When nSizeUtf8 is 0: an upper bound for the required buffer size
 *          (nSizeUtf16 * 3 + 1). Otherwise: the number of bytes written,
 *          including the terminating NUL. Conversion stops early, on a
 *          character boundary, when the buffer is too small. Returns 0 when
 *          pszUtf8 is null while nSizeUtf8 is non-zero.
 */
uint32_t Unicode16ToUTF8(IN const uint16_t* pszUtf16, IN uint32_t nSizeUtf16, OUT char* pszUtf8, IN uint32_t nSizeUtf8)
{
    // Sizing query: every code unit expands to at most 3 bytes (a surrogate
    // pair is 2 units -> 4 bytes), plus one byte for the terminator.
    if (0 == nSizeUtf8)
    {
        return (nSizeUtf16 * 3 + 1);
    }
    if (!pszUtf8)
    {
        return 0;
    }
    if (!pszUtf16)
    {
        // No source data: produce an empty, terminated string.
        nSizeUtf16 = 0;
    }

    uint32_t count = 0;
    for (uint32_t i = 0; i < nSizeUtf16; ++i)
    {
        uint32_t codePoint = pszUtf16[i];
        bool consumedPair = false;

        if (codePoint >= 0xD800 && codePoint <= 0xDBFF)
        {
            // High surrogate: must be followed by a low surrogate.
            if ((i + 1) < nSizeUtf16 && pszUtf16[i + 1] >= 0xDC00 && pszUtf16[i + 1] <= 0xDFFF)
            {
                codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (pszUtf16[i + 1] - 0xDC00);
                consumedPair = true;
            }
            else
            {
                codePoint = 0xFFFD;
            }
        }
        else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF)
        {
            // Low surrogate without a preceding high surrogate.
            codePoint = 0xFFFD;
        }

        uint32_t bytesNeeded;
        if (codePoint < 0x80)
        {
            bytesNeeded = 1;
        }
        else if (codePoint < 0x800)
        {
            bytesNeeded = 2;
        }
        else if (codePoint < 0x10000)
        {
            bytesNeeded = 3;
        }
        else
        {
            bytesNeeded = 4;
        }

        // Always keep one byte free for the terminator. count < nSizeUtf8
        // holds here, so the subtraction cannot wrap.
        if (bytesNeeded >= (nSizeUtf8 - count))
        {
            break;
        }

        switch (bytesNeeded)
        {
        case 1:
            pszUtf8[count] = static_cast<char>(codePoint);
            break;
        case 2:
            pszUtf8[count]     = static_cast<char>(0xC0 | (codePoint >> 6));
            pszUtf8[count + 1] = static_cast<char>(0x80 | (codePoint & 0x3F));
            break;
        case 3:
            pszUtf8[count]     = static_cast<char>(0xE0 | (codePoint >> 12));
            pszUtf8[count + 1] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
            pszUtf8[count + 2] = static_cast<char>(0x80 | (codePoint & 0x3F));
            break;
        default:
            pszUtf8[count]     = static_cast<char>(0xF0 | (codePoint >> 18));
            pszUtf8[count + 1] = static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
            pszUtf8[count + 2] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
            pszUtf8[count + 3] = static_cast<char>(0x80 | (codePoint & 0x3F));
            break;
        }
        count += bytesNeeded;

        if (consumedPair)
        {
            ++i;   // skip the low surrogate that was already consumed
        }
    }

    pszUtf8[count] = 0;
    return (count + 1);
}

UTF-8转Unicode16:


/**
 * Converts a UTF-8 byte sequence to a NUL-terminated UTF-16 string.
 *
 * Code points above U+FFFF are written as a surrogate pair. Every byte that
 * does not start a well-formed sequence (stray continuation byte, truncated
 * or overlong sequence, encoded surrogate, value above U+10FFFF) is replaced
 * by U+FFFD and decoding resumes at the next byte.
 *
 * @param pszUtf8     Source UTF-8 bytes (need not be NUL-terminated).
 * @param nSizeUtf8   Number of bytes to read from pszUtf8.
 * @param pszUtf16    Destination buffer; ignored when nSizeUtf16 is 0.
 * @param nSizeUtf16  Capacity of pszUtf16 in code units, terminator included.
 *                    Pass 0 to query a buffer size that is always sufficient.
 * @return  When nSizeUtf16 is 0: an upper bound for the required buffer size
 *          (nSizeUtf8 + 1). Otherwise: the number of code units written,
 *          including the terminating NUL. Conversion stops early, on a
 *          character boundary, when the buffer is too small. Returns 0 when
 *          pszUtf16 is null while nSizeUtf16 is non-zero.
 */
uint32_t UTF8ToUnicode16(IN const unsigned char* pszUtf8, IN uint32_t nSizeUtf8, OUT uint16_t* pszUtf16, IN uint32_t nSizeUtf16)
{
    // Sizing query: no input byte ever yields more than one code unit.
    if (0 == nSizeUtf16)
    {
        return (nSizeUtf8 + 1);
    }
    if (!pszUtf16)
    {
        return 0;
    }
    if (!pszUtf8)
    {
        // No source data: produce an empty, terminated string.
        nSizeUtf8 = 0;
    }

    // One slot is always reserved for the terminator.
    const uint32_t capacity = nSizeUtf16 - 1;
    uint32_t srcPos = 0;
    uint32_t dstPos = 0;

    while (srcPos < nSizeUtf8 && dstPos < capacity)
    {
        const unsigned char lead = pszUtf8[srcPos];
        uint32_t codePoint = 0;
        uint32_t seqLen = 0;        // 0 means "not a valid lead byte"
        uint32_t minCodePoint = 0;  // smallest value allowed for seqLen (rejects overlong forms)

        if (lead < 0x80)
        {
            codePoint = lead;
            seqLen = 1;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            codePoint = lead & 0x1F;
            seqLen = 2;
            minCodePoint = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            codePoint = lead & 0x0F;
            seqLen = 3;
            minCodePoint = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            codePoint = lead & 0x07;
            seqLen = 4;
            minCodePoint = 0x10000;
        }

        // The whole sequence must lie inside the input buffer.
        bool wellFormed = (seqLen != 0) && (seqLen <= (nSizeUtf8 - srcPos));
        for (uint32_t k = 1; wellFormed && k < seqLen; ++k)
        {
            const unsigned char trail = pszUtf8[srcPos + k];
            if ((trail & 0xC0) != 0x80)
            {
                wellFormed = false;
            }
            else
            {
                codePoint = (codePoint << 6) | (trail & 0x3F);
            }
        }
        if (wellFormed &&
            (codePoint < minCodePoint ||
             codePoint > 0x10FFFF ||
             (codePoint >= 0xD800 && codePoint <= 0xDFFF)))
        {
            wellFormed = false;
        }
        if (!wellFormed)
        {
            // Substitute the replacement character and resynchronise on the
            // next byte so that decoding always makes progress.
            codePoint = 0xFFFD;
            seqLen = 1;
        }

        if (codePoint >= 0x10000)
        {
            // Needs a surrogate pair; never emit only half of it.
            if ((capacity - dstPos) < 2)
            {
                break;
            }
            const uint32_t offset = codePoint - 0x10000;
            pszUtf16[dstPos++] = static_cast<uint16_t>(0xD800 + (offset >> 10));
            pszUtf16[dstPos++] = static_cast<uint16_t>(0xDC00 + (offset & 0x3FF));
        }
        else
        {
            pszUtf16[dstPos++] = static_cast<uint16_t>(codePoint);
        }
        srcPos += seqLen;
    }

    pszUtf16[dstPos] = 0;
    return (dstPos + 1);
}

欢迎关注我的微博

秒客网

C++中ANSI、Unicode16、UTF-8字符串之间的互转

相关文章