过滤 UTF-8 字符串中编码超过三个字节的字符,以及不合法的 UTF-8 字节

时间:2023-01-10 17:56:31
/**
 * Remove everything from a byte string that is not a well-formed UTF-8
 * character encoded in one, two or three bytes.
 *
 * UTF-8 layouts that are kept:
 *   U+0000 - U+007F | 0xxxxxxx
 *   U+0080 - U+07FF | 110xxxxx 10xxxxxx
 *   U+0800 - U+FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 *
 * Everything else is dropped:
 *   - sequences of four or more bytes (lead byte 0xF0 and above);
 *   - stray continuation bytes (10xxxxxx) that do not follow a valid lead;
 *   - lead bytes whose continuation bytes are missing, truncated or malformed
 *     (only the offending lead byte is discarded, so any valid bytes that
 *     follow it are still examined and kept);
 *   - overlong encodings (lead 0xC0 / 0xC1, or 0xE0 followed by 0x80-0x9F);
 *   - encoded UTF-16 surrogates (0xED followed by 0xA0-0xBF).
 *
 * @param string $str Raw input bytes; non-string values are cast to string.
 *
 * @return string The input with all rejected bytes removed. An empty input
 *                yields an empty string.
 */
function filterUtf8($str)
{
    $str = (string) $str;
    $length = strlen($str);
    $result = '';

    for ($i = 0; $i < $length; $i++) {
        $lead = ord($str[$i]);

        // 0xxxxxxx: plain ASCII, always valid.
        if ($lead < 0x80) {
            $result .= $str[$i];
            continue;
        }

        // 110xxxxx 10xxxxxx: two-byte sequence. 0xC0 and 0xC1 are excluded
        // because they can only produce overlong encodings of ASCII.
        if ($lead >= 0xC2 && $lead <= 0xDF) {
            if ($i + 1 < $length && (ord($str[$i + 1]) & 0xC0) === 0x80) {
                $result .= substr($str, $i, 2);
                $i += 1; // consume the continuation byte as well
            }
            continue;
        }

        // 1110xxxx 10xxxxxx 10xxxxxx: three-byte sequence.
        if ($lead >= 0xE0 && $lead <= 0xEF) {
            if ($i + 2 < $length) {
                $second = ord($str[$i + 1]);
                $third = ord($str[$i + 2]);

                // The second byte has a narrower range for two lead bytes:
                // 0xE0 needs >= 0xA0 (otherwise overlong), and
                // 0xED needs <= 0x9F (otherwise a UTF-16 surrogate).
                $secondMin = ($lead === 0xE0) ? 0xA0 : 0x80;
                $secondMax = ($lead === 0xED) ? 0x9F : 0xBF;

                if (
                    $second >= $secondMin
                    && $second <= $secondMax
                    && ($third & 0xC0) === 0x80
                ) {
                    $result .= substr($str, $i, 3);
                    $i += 2; // consume both continuation bytes as well
                }
            }
            continue;
        }

        // Anything that reaches this point is discarded one byte at a time:
        // stray continuation bytes (0x80-0xBF), the overlong leads 0xC0/0xC1,
        // and leads of four-or-more-byte sequences (0xF0-0xFF). The
        // continuation bytes trailing such a lead are stray by definition and
        // are therefore dropped on the following iterations.
    }

    return $result;
}