UTF-8字符串在lua中的字长问题

时间:2023-01-11 14:43:39

在lua中,UTF-8字符串里每个字符占用的字节数并不相同:英文、中文和其他字符各有不同的字节长度,所以UTF-8是一种变长编码。规则如下:

1. utf8字符是变长字符

2. 字符长度有规律

UTF-8字符规律

每个utf8字符的首个byte表示了该字符所占的字节数

0xxxxxxx - 0~127, 1 byte

110yxxxx - 192~223, 2 byte

1110yyyy - 224~239, 3 byte

11110zzz - 240~247, 4 byte



所以,如果想按字符数量(而不是字节数)截取字符串,就需要自己封装一下。代码如下:

local UTF8 = {}

--- Return the byte length of the UTF-8 sequence introduced by a lead byte.
-- @tparam number|nil char  numeric byte value (as returned by string.byte), or nil
-- @treturn number 0 when char is nil/false; 4, 3 or 2 for multi-byte lead
--   bytes; 1 for everything else
function UTF8.chSize(char)
    if not char then
        return 0
    elseif char >= 240 then -- 0xF0 = 11110000: lead byte of a 4-byte sequence
        return 4
    elseif char >= 224 then -- 0xE0 = 11100000: lead byte of a 3-byte sequence
        return 3
    elseif char >= 192 then -- 0xC0 = 11000000: lead byte of a 2-byte sequence
        return 2
    else
        -- ASCII (0-127). Stray continuation bytes (128-191) also land here so
        -- that a scan over malformed input still advances one byte at a time.
        return 1
    end
end


--- Extract a run of UTF-8 characters from a string, counting characters
-- rather than bytes. "..." is appended when text remains after the run.
-- @tparam string|nil str        source string; nil yields ""
-- @tparam number|nil startChar  1-based index of the first character (default 1)
-- @tparam number|nil numChars   maximum number of characters to keep (default 15)
-- @treturn string the extracted characters, plus "..." if the source continues
function UTF8.sub(str, startChar, numChars)
    if str == nil then
        return ""
    end
    if startChar == nil then
        startChar = 1
    end
    if numChars == nil then
        numChars = 15
    end

    local byteLen = #str

    -- Walk forward to the first byte of character number startChar.
    local startIndex = 1
    while startChar > 1 and startIndex <= byteLen do
        startIndex = startIndex + UTF8.chSize(string.byte(str, startIndex))
        startChar = startChar - 1
    end

    -- Consume up to numChars whole characters. The budget is measured in
    -- characters, so it is tracked separately from the byte index.
    local currentIndex = startIndex
    local taken = 0
    while taken < numChars and currentIndex <= byteLen do
        currentIndex = currentIndex + UTF8.chSize(string.byte(str, currentIndex))
        taken = taken + 1
    end

    local piece = str:sub(startIndex, currentIndex - 1)
    -- Mark truncation only when bytes actually remain past the extracted run.
    if currentIndex <= byteLen then
        return piece .. "..."
    end
    return piece
end

--- Count the UTF-8 characters in a string.
-- The string is scanned lead byte by lead byte, using UTF8.chSize to skip each
-- character's remaining bytes.
-- @tparam string|nil str  string to measure; nil is treated as empty
-- @treturn number number of characters (not bytes)
function UTF8.length(str)
    -- Tolerate nil the same way UTF8.sub does instead of raising on #nil.
    if str == nil then
        return 0
    end

    local byteLen = #str
    local length = 0
    local currentIndex = 1
    while currentIndex <= byteLen do
        currentIndex = currentIndex + UTF8.chSize(string.byte(str, currentIndex))
        length = length + 1
    end
    return length
end

--- Convert a value to a string.
-- A string is returned unchanged. A table is treated as a sequence of string
-- pieces that are joined in index order, stopping before the first "\0"
-- element. Any other type yields "".
-- @param str  string, table of string pieces, or any other value
-- @treturn string
function UTF8.toString(str)
    local kind = type(str)
    if kind == "string" then
        return str
    end
    if kind ~= "table" then
        return ""
    end

    -- ipairs visits 1..n in order; pairs gives no ordering guarantee, which
    -- would make both the joined text and the "\0" cut-off unpredictable.
    local pieces = {}
    for index, piece in ipairs(str) do
        if piece == "\0" then
            break
        end
        pieces[index] = piece
    end
    -- Join once instead of growing a string with repeated "..".
    return table.concat(pieces)
end
-- Hand the module table back to whoever loads this chunk.
return UTF8;