Text search: various Kopt search fixes

- Properly parse input text for words (the previous code wasn't working with Greek letters) - With multiple words search, don't allow "substring matching" for words in the middle - Remove support for Lua pattens, so to get proper substring matching (as we have with cre text search)
2025-08-10 00:52:38 +00:00 · 2023-07-03 19:49:22 +02:00
parent dffc2404ca
commit a4720b44cd
2 changed files with 33 additions and 15 deletions
--- a/frontend/document/koptinterface.lua
+++ b/frontend/document/koptinterface.lua
@@ -13,6 +13,7 @@ local Geom = require("ui/geometry")
 local KOPTContext = require("ffi/koptcontext")
 local Persist = require("persist")
 local TileCacheItem = require("document/tilecacheitem")
+local Utf8Proc = require("ffi/utf8proc")
 local logger = require("logger")
 local util = require("util")

@@ -1337,18 +1338,22 @@ end
 local function all_matches(boxes, pattern, caseInsensitive)
    -- pattern list of single words
    local plist = {}
-    -- split utf-8 characters
-    for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
-        -- split space seperated words
-        for word in words:gmatch("[^%s]+") do
-            table.insert(plist, caseInsensitive and word:lower() or word)
+    -- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations)
+    for word in util.gsplit(pattern, "%s+") do
+        if util.hasCJKChar(word) then
+            for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
+                table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char)
+            end
+        else
+            table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word)
        end
    end
+    local pnb = #plist
    -- return mached word indices from index i, j
    local function match(i, j)
        local pindex = 1
        local matched_indices = {}
-        if #plist == 0 then return end
+        if pnb == 0 then return end
        while true do
            if #boxes[i] < j then
                j = j - #boxes[i]
@@ -1356,10 +1361,27 @@ local function all_matches(boxes, pattern, caseInsensitive)
            end
            if i > #boxes then break end
            local box = boxes[i][j]
-            local word = caseInsensitive and box.word:lower() or box.word
-            if word:match(plist[pindex]) then
+            local word = caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word
+            local pword = plist[pindex]
+            local matched
+            if pnb == 1 then -- single word in plist
+                matched = word:find(pword, 1, true)
+            else -- multiple words in plist
+                if pindex == 1 then
+                    -- first word of query should match at end of a word from the document
+                   matched = word:sub(-#pword) == pword
+                elseif pindex == pnb then
+                    -- last word of query should match at start of the word from the document
+                    matched = word:sub(1, #pword) == pword
+                else
+                    -- middle words in query should match exactly the word from the document
+                    matched = word == pword
+                end
+            end
+            if matched then
                table.insert(matched_indices, {i, j})
-                if pindex == #plist then
+                if pindex == pnb then
+                    -- all words in plist iterated, all matched
                    return matched_indices
                else
                    j = j + 1
@@ -1370,6 +1392,8 @@ local function all_matches(boxes, pattern, caseInsensitive)
            end
        end
    end
+    -- Note that this returns a full word box, even if what matches
+    -- is only a substring of a word box.
    return coroutine.wrap(function()
        for i, line in ipairs(boxes) do
            for j, box in ipairs(line) do