mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
Merge pull request #2381 from Hzj-jie/master3
PR #2356 breaks CJK character splitting
This commit is contained in:
@@ -117,6 +117,16 @@ function util.splitToChars(text)
|
||||
return tab
|
||||
end
|
||||
|
||||
--- Test whether c is a single CJK character.
-- c matches only when it is exactly one three-byte UTF-8 sequence whose lead
-- byte lies in \228-\234 (0xE4-0xEA), i.e. code points U+4000-U+AFFF. That
-- range contains the CJK Unified Ideographs block (U+4E00-U+9FFF); it is an
-- approximation and also covers some neighbouring non-CJK code points.
---- @string c UTF-8 encoded character to test
---- @treturn boolean true if c is exactly one character in that range
function util.isCJKChar(c)
    -- Both trailing bytes must be UTF-8 continuation bytes (\128-\191). A bare
    -- "." for the last byte would also accept malformed input like "\228\184A".
    -- Comparing the match with c rejects strings longer than one character.
    return string.match(c, "[\228-\234][\128-\191][\128-\191]") == c
end
|
||||
|
||||
--- Test whether str contains at least one CJK character.
-- A CJK character is recognised as a three-byte UTF-8 sequence whose lead byte
-- lies in \228-\234 (0xE4-0xEA), i.e. code points U+4000-U+AFFF. That range
-- contains the CJK Unified Ideographs block (U+4E00-U+9FFF); it is an
-- approximation and also covers some neighbouring non-CJK code points.
---- @string str UTF-8 encoded text to scan
---- @treturn boolean true if such a sequence occurs anywhere in str
function util.hasCJKChar(str)
    -- Both trailing bytes must be UTF-8 continuation bytes (\128-\191). A bare
    -- "." for the last byte would let a truncated sequence followed by an
    -- ASCII byte count as a CJK character.
    return string.match(str, "[\228-\234][\128-\191][\128-\191]") ~= nil
end
|
||||
|
||||
--- Split text into a list of words, spaces and punctuation marks.
|
||||
---- @string text text to split
|
||||
---- @treturn table list of words, spaces and punctuation marks
|
||||
@@ -124,7 +134,7 @@ function util.splitToWords(text)
|
||||
local wlist = {}
|
||||
for word in util.gsplit(text, "[%s%p]+", true) do
|
||||
-- if the space-split word contains CJK characters
|
||||
if word:match("[\228-\234][\128-\191]+") then
|
||||
if util.hasCJKChar(word) then
|
||||
-- split with CJK characters
|
||||
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
|
||||
table.insert(wlist, char)
|
||||
@@ -138,7 +148,7 @@ end
|
||||
|
||||
--- Test whether a string could be separated by a char for multi-line rendering.
-- c is splitable when it is a CJK character (as decided by util.isCJKChar),
-- a single space, or contains an ASCII punctuation character (%p).
---- @string c character to test
---- @treturn boolean true if c is splitable
function util.isSplitable(c)
    -- Only the CJK-aware check is kept: the older space/punctuation-only
    -- return would make every CJK character unsplitable.
    return util.isCJKChar(c) or c == " " or string.match(c, "%p") ~= nil
end
|
||||
|
||||
return util
|
||||
|
||||
@@ -106,4 +106,24 @@ describe("util module", function()
|
||||
})
|
||||
end)
|
||||
|
||||
it("should split text to line - CJK", function()
    -- Walk the sentence character by character, closing the current chunk
    -- whenever util.isSplitable accepts the character just appended.
    local text = "彩虹是通过太阳光的折射引起的。"
    local chars = util.splitToChars(text)
    local total = #chars
    local chunks = {}
    local pending = ""
    for idx = 1, total do
        local ch = chars[idx]
        pending = pending .. ch
        if util.isSplitable(ch) then
            chunks[#chunks + 1] = pending
            pending = ""
        end
        -- The last iteration always flushes whatever is still pending.
        if idx == total then
            chunks[#chunks + 1] = pending
        end
    end
    assert.are_same(chunks, {
        "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
    })
end)
|
||||
|
||||
end)
|
||||
|
||||
Reference in New Issue
Block a user