split ancient Greek words with spacing character

This should fix #1705.
2025-08-10 00:52:38 +00:00 · 2016-06-28 23:50:21 +08:00
parent ffd01b3345
commit 71bf9efc7c
3 changed files with 37 additions and 7 deletions
@@ -19,6 +19,7 @@ local RenderText = require("ui/rendertext")
 local Screen = require("device").screen
 local Geom = require("ui/geometry")
 local util = require("util")
+local DEBUG= require("dbg")

 local TextBoxWidget = Widget:new{
    text = nil,
@@ -282,7 +283,7 @@ function TextBoxWidget:onHoldWord(callback, ges)
    local x, y = ges.pos.x - self.dimen.x, ges.pos.y - self.dimen.y
    local line_num = math.ceil(y / self.line_height_px)
    local line = self.vertical_string_list[line_num]
-
+    DEBUG("holding on line", line)
    if line then
        local char_start = line.offset
        local char_end  -- char_end is non-inclusive
@@ -304,10 +305,10 @@ function TextBoxWidget:onHoldWord(callback, ges)
                -- now find which word the character is in
                local words = util.splitToWords(line.text)
                local probe_idx = char_start
-                for _,w in ipairs(words) do
+                for _, w in ipairs(words) do
                    -- +1 for word separtor
-                    probe_idx = probe_idx + string.len(w)
-                    if idx <= probe_idx then
+                    probe_idx = probe_idx + #util.splitToChars(w)
+                    if idx <= probe_idx - 1 then
                        callback(w)
                        return
                    end
@@ -121,10 +121,15 @@ end
 ---- @string text text to split
 ---- @treturn table list of words, spaces and punctuations
 function util.splitToWords(text)
-    -- TODO: write test
    local wlist = {}
-    for words in text:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
-        for word in util.gsplit(words, "[%s%p]+", true) do
+    for word in util.gsplit(text, "[%s%p]+", true) do
+        -- if space splitted word contains CJK characters
+        if word:match("[\228-\234][\128-\191]+") then
+            -- split with CJK characters
+            for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
+                table.insert(wlist, char)
+            end
+        else
            table.insert(wlist, word)
        end
    end
@@ -52,4 +52,28 @@ describe("util module", function()
            "five",
        })
    end)
+
+    it("should split ancient greek words", function()
+        local words = util.splitToWords("Λαρισαῖος Λευκοθέα Λιγυαστάδης.")
+        assert.are_same(words, {
+            "Λαρισαῖος",
+            " ",
+            "Λευκοθέα",
+            " ",
+            "Λιγυαστάδης",
+            "."
+        })
+    end)
+
+    it("should split Chinese words", function()
+        local words = util.splitToWords("彩虹是通过太阳光的折射引起的。")
+        assert.are_same(words, {
+            "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
+        })
+    end)
+
+    it("should split words of multilingual text", function()
+        local words = util.splitToWords("BBC纪录片")
+        assert.are_same(words, {"BBC", "纪", "录", "片"})
+    end)
 end)