解决string.len 处理 utf-8 中文字符不正确的问题
解决 string.len 处理 UTF-8 中文字符不正确的问题。原因是 string.len 返回的是字节数(而不是字符数):比如一个中文字符在 UTF-8 编码下得到 3,在 GBK 编码下得到 2。
据说有 string.utf8len 或 string.lenutf8 这样的函数可以用,但试了一下,发现标准库里并没有这个函数(Lua 5.3 及以上版本自带 utf8.len,更早的版本没有)。
幸亏GitHub上有大神贴出了代码,效果非常好
https://github.com/airtonix/ouf_tags/blob/master/utf8.lua
贴出主要部分
--- Return the number of bytes occupied by the UTF-8 character that starts
-- at byte index `i` of `s`, validating it against the byte ranges of RFC 3629.
-- @tparam string s the string to inspect
-- @tparam[opt=1] number i byte index at which the character starts
-- @treturn number length of the character in bytes (1 to 4)
-- Raises an error when an argument has the wrong type, when `i` does not
-- address a byte of `s`, or when the bytes at `i` are not well-formed UTF-8.
local function utf8charbytes(s, i)
  -- argument defaults
  i = i or 1

  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8charbytes' (string expected, got " .. type(s) .. ")")
  end
  if type(i) ~= "number" then
    error("bad argument #2 to 'utf8charbytes' (number expected, got " .. type(i) .. ")")
  end

  local c = s:byte(i)
  -- s:byte returns nothing when i is outside the string; fail explicitly
  -- instead of with an opaque "attempt to compare nil with number".
  if not c then
    error("bad argument #2 to 'utf8charbytes' (index out of range)")
  end

  -- determine bytes needed for character, based on RFC 3629
  if c <= 127 then
    -- UTF8-1 (%x00-7F; NUL is a valid one-byte character)
    return 1

  elseif c >= 194 and c <= 223 then
    -- UTF8-2
    local c2 = s:byte(i + 1)

    if not c2 then
      error("UTF-8 string terminated early")
    end

    -- validate byte 2
    if c2 < 128 or c2 > 191 then
      error("Invalid UTF-8 character")
    end

    return 2

  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    local c2 = s:byte(i + 1)
    local c3 = s:byte(i + 2)

    if not c2 or not c3 then
      error("UTF-8 string terminated early")
    end

    -- validate byte 2 (lead bytes 224 and 237 restrict the allowed range)
    if c == 224 and (c2 < 160 or c2 > 191) then
      error("Invalid UTF-8 character")
    elseif c == 237 and (c2 < 128 or c2 > 159) then
      error("Invalid UTF-8 character")
    elseif c2 < 128 or c2 > 191 then
      error("Invalid UTF-8 character")
    end

    -- validate byte 3
    if c3 < 128 or c3 > 191 then
      error("Invalid UTF-8 character")
    end

    return 3

  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    local c2 = s:byte(i + 1)
    local c3 = s:byte(i + 2)
    local c4 = s:byte(i + 3)

    if not c2 or not c3 or not c4 then
      error("UTF-8 string terminated early")
    end

    -- validate byte 2 (lead bytes 240 and 244 restrict the allowed range)
    if c == 240 and (c2 < 144 or c2 > 191) then
      error("Invalid UTF-8 character")
    elseif c == 244 and (c2 < 128 or c2 > 143) then
      error("Invalid UTF-8 character")
    elseif c2 < 128 or c2 > 191 then
      error("Invalid UTF-8 character")
    end

    -- validate byte 3
    if c3 < 128 or c3 > 191 then
      error("Invalid UTF-8 character")
    end

    -- validate byte 4
    if c4 < 128 or c4 > 191 then
      error("Invalid UTF-8 character")
    end

    return 4

  else
    -- stray continuation byte (128-191) or a lead byte that RFC 3629
    -- never allows (192, 193, 245-255)
    error("Invalid UTF-8 character")
  end
end

--- Count the characters (not the bytes) of a UTF-8 encoded string.
-- @tparam string s the string to measure
-- @treturn number number of UTF-8 characters in `s`
-- Raises an error when `s` is not a string or is not well-formed UTF-8.
local function utf8len(s)
  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8len' (string expected, got " .. type(s) .. ")")
  end

  local pos = 1
  local bytes = #s
  local len = 0

  -- Walk the string one whole character at a time; utf8charbytes reports
  -- how many bytes to skip and raises on malformed input.
  while pos <= bytes do
    len = len + 1
    pos = pos + utf8charbytes(s, pos)
  end

  return len
end

print(utf8len("中国人"))
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: PHP的COOKIE原理介绍与使用
- 下一篇: tp5中db::table和db::name的区别