(C/C++)UTF8字符串中字的切分

时间:2023-01-10 15:36:46
UTF-8 采用变长字节来表示字符,最初的设计理论上最多可以到 6 个字节长度(RFC 3629 之后的标准将其限制为最多 4 个字节)。UTF-8 编码兼容 ASCII(0-127),也就是说 UTF-8 对 ASCII 字符的编码与 ASCII 完全相同。对于超过一个字节长度的字符,采用以下编码规范: 
首字节左边连续 1 的个数表示这个字符编码所占的字节数,例如两字节字符的编码样式为:110xxxxx 10xxxxxx;三字节字符的编码样式为:1110xxxx 10xxxxxx 10xxxxxx;以此类推,六字节字符的编码样式为:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。xxx 的值由字符编码的二进制表示的位填入。 
1字节:0xxxxxxx 
2字节:110xxxxx 10xxxxxx 
3字节:1110xxxx 10xxxxxx 10xxxxxx 
4字节:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
5字节:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
///////////////////////////////////////////////////////////////////////////////
# include <string.h>
# include <string>
# include <vector>

///////////////////////////////////////////////////////////////////////////////
using namespace std;

///////////////////////////////////////////////////////////////////////////////
// Splits a NUL-terminated UTF-8 byte string into tokens and stores them in vec.
//
// Tokenisation rules (decided from the lead byte only; continuation bytes are
// not validated):
//   - a maximal run of consecutive ASCII bytes (0x01-0x7F) becomes ONE token;
//   - a 2-byte sequence (lead 0xC0-0xDF) becomes one token;
//   - a 3-byte sequence (lead 0xE0-0xEF) becomes one token;
//   - 4-, 5- and 6-byte sequences (lead 0xF0-0xFD) are skipped, not emitted;
//   - stray continuation bytes (0x80-0xBF) and 0xFE/0xFF are skipped one byte
//     at a time.
// A multi-byte sequence that is cut short by the end of the string is dropped
// and scanning stops there.
//
// Parameters:
//   pszSentence - NUL-terminated input; NULL is treated as an empty string.
//   vec         - output; cleared first, then filled with the tokens in order.
//
// NOTE(review): 4-byte sequences (e.g. emoji, CJK Extension B) are valid UTF-8
// but are deliberately still skipped here to keep the existing output contract;
// confirm with callers whether they should be emitted instead.
void fnReadCharactersUTF8( const char* pszSentence, std::vector<std::string>& vec )
{
    vec.clear();

    if ( pszSentence == NULL )
    {
        return;
    }

    // Measure once; every access below is bounds-checked against nLen so the
    // scan can never step over the terminating NUL.
    const size_t nLen = strlen( pszSentence );
    size_t i = 0;

    while ( i < nLen )
    {
        const unsigned char lead = static_cast<unsigned char>( pszSentence[i] );

        if ( lead < 0x80 )
        {
            // Collect the whole ASCII run. Building the std::string straight
            // from the input avoids any fixed-size scratch buffer.
            size_t j = i + 1;
            while ( j < nLen && static_cast<unsigned char>( pszSentence[j] ) < 0x80 )
            {
                ++j;
            }
            vec.push_back( std::string( pszSentence + i, j - i ) );
            i = j;
            continue;
        }

        // Sequence length implied by the lead byte, and whether it is emitted.
        size_t nBytes = 1;
        bool bEmit = false;

        if ( lead < 0xC0 )
        {
            nBytes = 1;             // stray continuation byte: skip it
        }
        else if ( lead < 0xE0 )
        {
            nBytes = 2;
            bEmit = true;
        }
        else if ( lead < 0xF0 )
        {
            nBytes = 3;
            bEmit = true;
        }
        else if ( lead < 0xF8 )
        {
            nBytes = 4;             // skipped, see NOTE(review) above
        }
        else if ( lead < 0xFC )
        {
            nBytes = 5;             // legacy 5-byte form: skipped
        }
        else if ( lead < 0xFE )
        {
            nBytes = 6;             // legacy 6-byte form: skipped
        }
        else
        {
            nBytes = 1;             // 0xFE / 0xFF never appear in UTF-8
        }

        if ( nBytes > nLen - i )
        {
            // Sequence is truncated by the end of the string; nothing valid
            // can follow, so stop.
            break;
        }

        if ( bEmit )
        {
            vec.push_back( std::string( pszSentence + i, nBytes ) );
        }
        i += nBytes;
    }
}

////////////////////////////////// FILE END ///////////////////////////////////