mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
split ancient Greek words on spacing characters
This should fix #1705.
This commit is contained in:
@@ -19,6 +19,7 @@ local RenderText = require("ui/rendertext")
|
||||
local Screen = require("device").screen
|
||||
local Geom = require("ui/geometry")
|
||||
local util = require("util")
|
||||
local DEBUG= require("dbg")
|
||||
|
||||
local TextBoxWidget = Widget:new{
|
||||
text = nil,
|
||||
@@ -282,7 +283,7 @@ function TextBoxWidget:onHoldWord(callback, ges)
|
||||
local x, y = ges.pos.x - self.dimen.x, ges.pos.y - self.dimen.y
|
||||
local line_num = math.ceil(y / self.line_height_px)
|
||||
local line = self.vertical_string_list[line_num]
|
||||
|
||||
DEBUG("holding on line", line)
|
||||
if line then
|
||||
local char_start = line.offset
|
||||
local char_end -- char_end is non-inclusive
|
||||
@@ -304,10 +305,10 @@ function TextBoxWidget:onHoldWord(callback, ges)
|
||||
-- now find which word the character is in
|
||||
local words = util.splitToWords(line.text)
|
||||
local probe_idx = char_start
|
||||
for _,w in ipairs(words) do
|
||||
for _, w in ipairs(words) do
|
||||
-- +1 for word separator
|
||||
probe_idx = probe_idx + string.len(w)
|
||||
if idx <= probe_idx then
|
||||
probe_idx = probe_idx + #util.splitToChars(w)
|
||||
if idx <= probe_idx - 1 then
|
||||
callback(w)
|
||||
return
|
||||
end
|
||||
|
||||
@@ -121,10 +121,15 @@ end
|
||||
---- @string text text to split
|
||||
---- @treturn table list of words, spaces and punctuations
|
||||
function util.splitToWords(text)
|
||||
-- TODO: write test
|
||||
local wlist = {}
|
||||
for words in text:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
|
||||
for word in util.gsplit(words, "[%s%p]+", true) do
|
||||
for word in util.gsplit(text, "[%s%p]+", true) do
|
||||
-- if the space-split word contains CJK characters
|
||||
if word:match("[\228-\234][\128-\191]+") then
|
||||
-- split with CJK characters
|
||||
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
|
||||
table.insert(wlist, char)
|
||||
end
|
||||
else
|
||||
table.insert(wlist, word)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -52,4 +52,28 @@ describe("util module", function()
|
||||
"five",
|
||||
})
|
||||
end)
|
||||
|
||||
-- Spec case for polytonic Greek input: the expected list keeps each whole
-- word as a single entry, with the space and the final full stop as
-- separate entries of their own.
it("should split ancient greek words", function()
|
||||
local words = util.splitToWords("Λαρισαῖος Λευκοθέα Λιγυαστάδης.")
|
||||
assert.are_same(words, {
|
||||
"Λαρισαῖος",
|
||||
" ",
|
||||
"Λευκοθέα",
|
||||
" ",
|
||||
"Λιγυαστάδης",
|
||||
"."
|
||||
})
|
||||
end)
|
||||
|
||||
-- Spec case for CJK input: every Chinese character, including the closing
-- full-width full stop, is expected as its own entry in the result list.
it("should split Chinese words", function()
|
||||
local words = util.splitToWords("彩虹是通过太阳光的折射引起的。")
|
||||
assert.are_same(words, {
|
||||
"彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
|
||||
})
|
||||
end)
|
||||
|
||||
-- Spec case for mixed-script input: the Latin run "BBC" is expected as one
-- entry, followed by each Chinese character as a separate entry.
it("should split words of multilingual text", function()
|
||||
local words = util.splitToWords("BBC纪录片")
|
||||
assert.are_same(words, {"BBC", "纪", "录", "片"})
|
||||
end)
|
||||
end)
|
||||
|
||||
Reference in New Issue
Block a user