UNICODE(UTF-16)与UTF-8编码的相互转换

我们通常所说的UNICODE其实是UTF-16，下面这几个函数实现UNICODE(UTF-16)与UTF-8编码的相互转换。

[cpp] view plain copy

/**
* This file implements functions for:
*
* 1. Converting a UTF-16 character to a UTF-8 character.
* 2. Converting a UTF-8 character to a UTF-16 character.
*
* 3. Converting a UTF-16 string to a UTF-8 string.
* 4. Converting a UTF-8 string to a UTF-16 string.
*/
/*
 * Size in bytes of the scratch buffer that holds one encoded UTF-8 character.
 * UnicodeToUTF8 writes at most 6 bytes per character; the extra bytes are
 * headroom (e.g. for a terminator).
 */
#define MAX_CHARACTER_SIZE 8
/**
* UnicodeToUTF8 - encode one code point as a UTF-8 byte sequence
* @unicode: the code point to encode (0 .. 0x7fffffff)
* @p: destination buffer; must have room for up to 6 bytes
*
* Uses the original 1..6 byte UTF-8 scheme, so values above 0x10ffff are
* still encoded (as 4, 5 or 6 byte sequences). No terminator is written.
*
* @return: One step over the last byte written. Returns @p unchanged
* (nothing written) when @unicode is negative, and NULL when @p is NULL.
*/
unsigned char * UnicodeToUTF8( int unicode, unsigned char *p)
{
    unsigned char *e = p;

    /* A negative value is not a code point; emitting its low byte would
     * produce garbage, so report "nothing written" instead. */
    if (e == NULL || unicode < 0)
    {
        return e;
    }

    if (unicode < 0x80)
    {
        /* 0xxxxxxx : 7 payload bits */
        *e++ = (unsigned char) unicode;
    }
    else if (unicode < 0x800)
    {
        /* 110xxxxx 10xxxxxx : 11 payload bits */
        *e++ = (unsigned char) (((unicode >> 6) & 0x1f) | 0xc0);
        *e++ = (unsigned char) ((unicode & 0x3f) | 0x80);
    }
    else if (unicode < 0x10000)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx : 16 payload bits */
        *e++ = (unsigned char) (((unicode >> 12) & 0x0f) | 0xe0);
        *e++ = (unsigned char) (((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char) ((unicode & 0x3f) | 0x80);
    }
    else if (unicode < 0x200000)
    {
        /* 11110xxx + 3 continuation bytes : 21 payload bits */
        *e++ = (unsigned char) (((unicode >> 18) & 0x07) | 0xf0);
        *e++ = (unsigned char) (((unicode >> 12) & 0x3f) | 0x80);
        *e++ = (unsigned char) (((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char) ((unicode & 0x3f) | 0x80);
    }
    else if (unicode < 0x4000000)
    {
        /* 111110xx + 4 continuation bytes : 26 payload bits */
        *e++ = (unsigned char) (((unicode >> 24) & 0x03) | 0xf8);
        *e++ = (unsigned char) (((unicode >> 18) & 0x3f) | 0x80);
        *e++ = (unsigned char) (((unicode >> 12) & 0x3f) | 0x80);
        *e++ = (unsigned char) (((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char) ((unicode & 0x3f) | 0x80);
    }
    else
    {
        /* 1111110x + 5 continuation bytes : 31 payload bits */
        *e++ = (unsigned char) (((unicode >> 30) & 0x01) | 0xfc);
        *e++ = (unsigned char) (((unicode >> 24) & 0x3f) | 0x80);
        *e++ = (unsigned char) (((unicode >> 18) & 0x3f) | 0x80);
        *e++ = (unsigned char) (((unicode >> 12) & 0x3f) | 0x80);
        *e++ = (unsigned char) (((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char) ((unicode & 0x3f) | 0x80);
    }

    /* One step over the end of the encoded character */
    return e;
}
/**
* UTF8ToUnicode - decode one UTF-8 byte sequence into a code point
* @ch: Buffer holding a UTF-8 character (NUL-terminated string or at least
*      as many bytes as the lead byte announces)
* @unicode: Receives the decoded code point
*
* Accepts the original 1..6 byte UTF-8 scheme. Every continuation byte is
* checked to be of the form 10xxxxxx before it is consumed, so a sequence
* cut short by the string terminator is never read past.
*
* @return: Byte count of the UTF-8 character (1 ~ 6), usable to step to the
* next character of a UTF-8 string. Returns 0, leaving *@unicode untouched,
* when an argument is NULL, the lead byte is invalid (a stray continuation
* byte, 0xfe or 0xff), or a continuation byte is missing or malformed.
*/
int UTF8ToUnicode (unsigned char *ch, int *unicode)
{
    int value = 0;
    int n = 0;
    int i = 0;

    if (ch == NULL || unicode == NULL)
    {
        return 0;
    }

    /* Classify the lead byte: sequence length and payload bits it carries */
    if (ch[0] < 0x80)
    {
        /* 1: 0xxxxxxx */
        value = ch[0];
        n = 1;
    }
    else if (ch[0] < 0xc0)
    {
        /* 10xxxxxx is a continuation byte and cannot start a character */
        return 0;
    }
    else if (ch[0] < 0xe0)
    {
        /* 2: 110xxxxx */
        value = ch[0] & 0x1f;
        n = 2;
    }
    else if (ch[0] < 0xf0)
    {
        /* 3: 1110xxxx */
        value = ch[0] & 0x0f;
        n = 3;
    }
    else if (ch[0] < 0xf8)
    {
        /* 4: 11110xxx */
        value = ch[0] & 0x07;
        n = 4;
    }
    else if (ch[0] < 0xfc)
    {
        /* 5: 111110xx */
        value = ch[0] & 0x03;
        n = 5;
    }
    else if (ch[0] < 0xfe)
    {
        /* 6: 1111110x */
        value = ch[0] & 0x01;
        n = 6;
    }
    else
    {
        /* 0xfe and 0xff never appear in UTF-8 */
        return 0;
    }

    /* Fold in the continuation bytes one at a time; stopping at the first
     * bad byte guarantees we never read beyond a NUL terminator. */
    for (i = 1; i < n; i++)
    {
        if ((ch[i] & 0xc0) != 0x80)
        {
            return 0;
        }
        value = (value << 6) | (ch[i] & 0x3f);
    }

    *unicode = value;
    /* Byte count of this utf-8 character */
    return n;
}
/**
* UnicodeStrToUTF8Str - Convert a utf-16 string to a utf-8 string
* @unicode_str: A zero-terminated utf-16 string
* @utf8_str: A buffer to contain the utf-8 string
* @utf8_str_size: Size in bytes of the utf-8 string buffer, including room
*                 for the terminating NUL
*
* A high surrogate followed by a low surrogate is combined into a single
* supplementary code point before encoding. An unpaired surrogate is encoded
* as-is. Conversion stops early when the next character (plus the NUL) would
* not fit; the output is always NUL-terminated when @utf8_str_size > 0.
*
* @return: One step over the end of the last utf-8 character written (i.e. a
* pointer to the terminating NUL). Returns @utf8_str unchanged when an
* argument is NULL or @utf8_str_size is not positive.
*/
unsigned char * UnicodeStrToUTF8Str (unsigned short * unicode_str,
                                     unsigned char * utf8_str, int utf8_str_size)
{
    unsigned char utf8_ch[MAX_CHARACTER_SIZE];
    unsigned char *s = utf8_str;
    unsigned char *e = NULL;
    int unicode = 0;
    int len = 0;

    if (unicode_str == NULL || s == NULL || utf8_str_size <= 0)
    {
        return s;
    }

    /* Terminate up front so an empty input still yields a valid string */
    *s = '\0';

    while ((unicode = (int) (*unicode_str++)))
    {
        /* Combine a surrogate pair. Peeking at the next unit is safe: the
         * current unit was non-zero, so the terminator is at worst next. */
        if (unicode >= 0xd800 && unicode <= 0xdbff
            && *unicode_str >= 0xdc00 && *unicode_str <= 0xdfff)
        {
            unicode = 0x10000 + ((unicode - 0xd800) << 10)
                      + ((int) (*unicode_str++) - 0xdc00);
        }

        e = UnicodeToUTF8 (unicode, utf8_ch);
        if (!(e > utf8_ch))
        {
            /* Converting error occurs */
            return s;
        }
        len = (int) (e - utf8_ch);

        /* Stop if the character plus the terminating NUL would not fit */
        if ((int) (s - utf8_str) + len >= utf8_str_size)
        {
            return s;
        }

        memcpy (s, utf8_ch, (size_t) len);
        s += len;
        *s = '\0';
    }

    return s;
}
/**
* UTF8StrToUnicodeStr - Convert a utf-8 string to a utf-16 string
* @utf8_str: A NUL-terminated utf-8 string
* @unicode_str: A buffer to contain the utf-16 string
* @unicode_str_size: Capacity of the utf-16 buffer in 16-bit units,
*                    including room for the terminating zero unit
*
* Code points above 0xffff are written as a surrogate pair. Code points that
* utf-16 cannot represent (above 0x10ffff) are replaced by U+FFFD. Conversion
* stops early at a malformed or truncated utf-8 sequence, or when the next
* character (plus the terminator) would not fit. The output is always
* zero-terminated when @unicode_str_size > 0.
*
* @return: Number of utf-16 units written, not counting the terminator.
* Returns 0 when an argument is NULL or @unicode_str_size is not positive.
*/
int UTF8StrToUnicodeStr (unsigned char * utf8_str,
                         unsigned short * unicode_str, int unicode_str_size)
{
    unsigned char *s = utf8_str;
    unsigned short *e = unicode_str;
    int unicode = 0;
    int count = 0;
    int units = 0;
    int n = 0;
    int i = 0;

    if (utf8_str == NULL || unicode_str == NULL || unicode_str_size <= 0)
    {
        return 0;
    }

    /* Terminate up front so an empty input still yields a valid string */
    *e = 0;

    while (*s)
    {
        n = UTF8ToUnicode (s, &unicode);
        if (n <= 0)
        {
            /* Converting error occurs */
            return count;
        }

        /* Never step the cursor across the string terminator: a NUL inside
         * the reported sequence means the input was cut short. */
        for (i = 1; i < n; i++)
        {
            if (s[i] == 0)
            {
                return count;
            }
        }

        /* utf-16 cannot represent anything beyond U+10FFFF */
        if (unicode < 0 || unicode > 0x10ffff)
        {
            unicode = 0xfffd;
        }

        units = (unicode >= 0x10000) ? 2 : 1;

        /* Need room for this character and the terminating zero unit */
        if ((count + units) >= unicode_str_size)
        {
            return count;
        }

        if (units == 2)
        {
            /* Split into high/low surrogate, 10 payload bits each */
            unicode -= 0x10000;
            *e++ = (unsigned short) (0xd800 | (unicode >> 10));
            *e++ = (unsigned short) (0xdc00 | (unicode & 0x3ff));
        }
        else
        {
            *e++ = (unsigned short) unicode;
        }
        *e = 0;
        count += units;

        /* Step to next utf-8 character */
        s += n;
    }

    return count;
}

秒客网

UNICODE(UTF-16)与UTF-8编码的相互转换

相关文章