一个高效过滤非UTF8字符的C函数(也可用来判断是否utf8)

时间:2023-03-10 04:37:02
一个高效过滤非UTF8字符的C函数(也可用来判断是否utf8)
/*
UTF-8 valid format list:
0xxxxxxx
110xxxxx 10xxxxxx
1110xxxx 10xxxxxx 10xxxxxx
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
char *filter_none_utf8_chars(char *src, int *len)
{
unsigned char *p;
unsigned char *pSub;
unsigned char *pStrEnd;
unsigned char *pCharEnd;
int bytes;
unsigned char *filtered;
unsigned char *pDest;
unsigned char *pInvalidCharStart; pStrEnd = (unsigned char *)src + (*len);
p = (unsigned char *)src;
pInvalidCharStart = NULL;
while (p < pStrEnd)
{
if (*p < 0x80)
{
p++;
continue;
} if ((*p & 0xE0) == 0xC0) //110xxxxx
{
bytes = ;
}
else if ((*p & 0xF0) == 0xE0) //1110xxxx
{
bytes = ;
}
else if ((*p & 0xF8) == 0xF0) //11110xxx
{
bytes = ;
}
else if ((*p & 0xFC) == 0xF8) //111110xx {
bytes = ;
}
else if ((*p & 0xFE) == 0xFC) //1111110x
{
bytes = ;
}
else
{
pInvalidCharStart = p;
break;
} p++;
pCharEnd = p + bytes;
if (pCharEnd > pStrEnd)
{
pInvalidCharStart = p - ;
break;
} for (; p<pCharEnd; p++)
{
if ((*p & 0xC0) != 0x80)
{
break;
}
} if (p != pCharEnd)
{
pInvalidCharStart = pCharEnd - (bytes + );
break;
}
} if (pInvalidCharStart == NULL) //all chars are valid
{
return src;
} filtered = (unsigned char *)malloc(sizeof(char) * (*len));
if (filtered == NULL)
{
*len = ;
*src = '\0';
return src;
} pDest = filtered;
bytes = (char *)pInvalidCharStart - src;
if (bytes > )
{
memcpy(pDest, src, bytes);
pDest += bytes;
} p = pInvalidCharStart + ; //skip this invalid char
while (p < pStrEnd)
{
if (*p < 0x80)
{
*pDest++ = *p++;
continue;
} if ((*p & 0xE0) == 0xC0) //110xxxxx
{
bytes = ;
}
else if ((*p & 0xF0) == 0xE0) //1110xxxx
{
bytes = ;
}
else if ((*p & 0xF8) == 0xF0) //11110xxx
{
bytes = ;
}
else if ((*p & 0xFC) == 0xF8) //111110xx
{
bytes = ;
}
else if ((*p & 0xFE) == 0xFC) //1111110x
{
bytes = ;
} else //invalid char
{
p++;
continue;
} pSub = p + ;
pCharEnd = pSub + bytes;
if (pCharEnd > pStrEnd)
{
p++;
continue;
} for (; pSub<pCharEnd; pSub++)
{
if ((*pSub & 0xC0) != 0x80)
{
break;
}
} if (pSub != pCharEnd)
{
p++;
continue;
} bytes += ;
memcpy(pDest, pSub-bytes, bytes);
pDest += bytes;
p += bytes;
} *len = pDest - filtered;
memcpy(src, filtered, *len);
* (src + (*len)) = '\0'; free(filtered); return src;
}

http://bbs.chinaunix.net/forum.php?mod=viewthread&tid=1230313