[C/C++]_[初级]_[替换过滤utf8字符串里无效字符]

时间:2023-01-10 17:28:18


场景:

1. 分析数据时,获取到的数据是字符串,但是有可能不是正确的完整的utf8字符串,打印出来或输出到文件时表现出来的就是显示乱码.

这时候就需要过滤掉非法字符使utf8字符串能正确显示, 比如把非法字符替换为#


代码:

1. 这个函数的特点是逐个字节进行判断, 适用于任意长度、任意构造的 utf8 (包括无效的) 字符串.

// Returns the length in bytes (2, 3 or 4) of the well-formed multi-byte UTF-8
// sequence starting at `p`, or 0 when the bytes at `p` do not form one.
// Requires p < end; never reads at or beyond `end`.
static int FilterUtf8SequenceLength(const unsigned char * p, const unsigned char * end)
{
    const unsigned char lead = p[0];
    int length = 0;
    // Allowed range of the second byte. It is narrower than 0x80..0xBF for
    // some lead bytes so that overlong forms, UTF-16 surrogates and code
    // points above U+10FFFF are rejected.
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;

    if (lead >= 0xC2 && lead <= 0xDF)
    {
        // 110X XXXX (0xC0 and 0xC1 could only encode overlong ASCII)
        length = 2;
    }
    else if (lead >= 0xE0 && lead <= 0xEF)
    {
        // 1110 XXXX
        length = 3;
        if (lead == 0xE0)
        {
            second_min = 0xA0; // below this the sequence is overlong
        }
        else if (lead == 0xED)
        {
            second_max = 0x9F; // above this it is a surrogate (U+D800..U+DFFF)
        }
    }
    else if (lead >= 0xF0 && lead <= 0xF4)
    {
        // 1111 0XXX (0xF5..0xF7 would encode values above U+10FFFF)
        length = 4;
        if (lead == 0xF0)
        {
            second_min = 0x90; // below this the sequence is overlong
        }
        else if (lead == 0xF4)
        {
            second_max = 0x8F; // above this the value exceeds U+10FFFF
        }
    }
    else
    {
        // Stray continuation byte (10XX XXXX) or a byte that can never
        // start a UTF-8 sequence.
        return 0;
    }

    if (end - p < length)
    {
        return 0; // sequence is cut off by the end of the buffer
    }
    if (p[1] < second_min || p[1] > second_max)
    {
        return 0;
    }
    for (int i = 2; i < length; ++i)
    {
        if ((p[i] & 0xC0) != 0x80)
        {
            return 0;
        }
    }
    return length;
}

/// Replaces, in place, every byte of `string` that is not part of a
/// well-formed UTF-8 sequence with '#', so the buffer becomes valid UTF-8.
///
/// ASCII bytes other than TAB (0x09), LF (0x0A), CR (0x0D) and the printable
/// range 0x20..0x7E are also replaced with '#'. Each byte of an ill-formed
/// sequence (truncated, overlong, surrogate, above U+10FFFF, stray
/// continuation byte, invalid lead byte) becomes one '#', so the buffer
/// length never changes.
///
/// @param string  buffer to filter; modified in place.
/// @param length  number of bytes in `string`.
/// @return false when `string` is null or `length` is negative, true otherwise.
bool IREUtil::FilterUtf8(unsigned char * string,int length){
    if (!string || length < 0)
    {
        return false;
    }

    unsigned char * cur = string;
    unsigned char * const end = string + length;

    while (cur < end)
    {
        const unsigned char b = cur[0];

        if (b <= 0x7F)
        {
            // Filter out invisible (control) characters.
            const bool visible = b == 0x09 || b == 0x0A || b == 0x0D ||
                                 (0x20 <= b && b <= 0x7E);
            if (!visible)
            {
                cur[0] = '#';
            }
            cur += 1;
            continue;
        }

        const int sequence_length = FilterUtf8SequenceLength(cur, end);
        if (sequence_length > 0)
        {
            cur += sequence_length;
            continue;
        }

        // Not the start of a well-formed sequence: replace this byte only.
        // Any continuation bytes that followed it are then seen as stray
        // bytes on the next iterations and replaced as well.
        cur[0] = '#';
        cur += 1;
    }
    return true;
}

附送一个从网络上下载的、用另一种方法判断是否是 utf8 字符串的函数. 网上的原始版本有个问题: 读取后续字节 bytes[1]~bytes[3] 之前没有检查剩余长度, 所以输入必须由完整的 utf8 序列组成, 否则(例如末尾的多字节字符被截断时)会越界读取.

原文地址没留下:

/// Checks whether the first `length` bytes of `string` are well-formed UTF-8
/// made up only of TAB, LF, CR, printable ASCII (0x20..0x7E) and multi-byte
/// sequences. Overlong encodings, UTF-16 surrogates and code points above
/// U+10FFFF are rejected.
///
/// Every multi-byte branch first checks that enough bytes remain, so a
/// sequence truncated by the end of the buffer yields false instead of
/// reading past `string + length`.
///
/// @param string  bytes to examine; not modified.
/// @param length  number of bytes in `string`.
/// @return true when the whole range is valid (an empty range is valid);
///         false otherwise, or when `string` is null or `length` is negative.
bool IREUtil::is_utf8(const unsigned char * string,int length){
    if (!string || length < 0)
    {
        return false;
    }

    const unsigned char * bytes = string;
    const unsigned char * const end = bytes + length;

    while (bytes < end)
    {
        const unsigned char b0 = bytes[0];

        // ASCII
        // use b0 <= 0x7F to allow ASCII control characters
        if (b0 == 0x09 || b0 == 0x0A || b0 == 0x0D ||
            (0x20 <= b0 && b0 <= 0x7E))
        {
            bytes += 1;
            continue;
        }

        // non-overlong 2-byte
        if (end - bytes >= 2 &&
            (0xC2 <= b0 && b0 <= 0xDF) &&
            (0x80 <= bytes[1] && bytes[1] <= 0xBF))
        {
            bytes += 2;
            continue;
        }

        if (end - bytes >= 3 &&
            (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
            (
                (// excluding overlongs
                 b0 == 0xE0 &&
                 (0xA0 <= bytes[1] && bytes[1] <= 0xBF)
                ) ||
                (// straight 3-byte
                 ((0xE1 <= b0 && b0 <= 0xEC) || b0 == 0xEE || b0 == 0xEF) &&
                 (0x80 <= bytes[1] && bytes[1] <= 0xBF)
                ) ||
                (// excluding surrogates
                 b0 == 0xED &&
                 (0x80 <= bytes[1] && bytes[1] <= 0x9F)
                )
            ))
        {
            bytes += 3;
            continue;
        }

        if (end - bytes >= 4 &&
            (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
            (0x80 <= bytes[3] && bytes[3] <= 0xBF) &&
            (
                (// planes 1-3
                 b0 == 0xF0 &&
                 (0x90 <= bytes[1] && bytes[1] <= 0xBF)
                ) ||
                (// planes 4-15
                 (0xF1 <= b0 && b0 <= 0xF3) &&
                 (0x80 <= bytes[1] && bytes[1] <= 0xBF)
                ) ||
                (// plane 16
                 b0 == 0xF4 &&
                 (0x80 <= bytes[1] && bytes[1] <= 0x8F)
                )
            ))
        {
            bytes += 4;
            continue;
        }

        return false;
    }
    return true;
}


欢迎指正!