Text search: various Kopt search fixes

- Properly parse input text for words (the previous
  code wasn't working with Greek letters)
- With a multi-word search, don't allow "substring
  matching" for words in the middle
- Remove support for Lua patterns, so as to get proper
  substring matching (as we have with cre text search)
This commit is contained in:
poire-z
2023-07-03 19:49:22 +02:00
parent dffc2404ca
commit a4720b44cd
2 changed files with 33 additions and 15 deletions

View File

@@ -13,6 +13,7 @@ local Geom = require("ui/geometry")
local KOPTContext = require("ffi/koptcontext")
local Persist = require("persist")
local TileCacheItem = require("document/tilecacheitem")
local Utf8Proc = require("ffi/utf8proc")
local logger = require("logger")
local util = require("util")
@@ -1337,18 +1338,22 @@ end
local function all_matches(boxes, pattern, caseInsensitive)
-- pattern list of single words
local plist = {}
-- split utf-8 characters
for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
-- split space seperated words
for word in words:gmatch("[^%s]+") do
table.insert(plist, caseInsensitive and word:lower() or word)
-- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations)
for word in util.gsplit(pattern, "%s+") do
if util.hasCJKChar(word) then
for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char)
end
else
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word)
end
end
local pnb = #plist
-- return mached word indices from index i, j
local function match(i, j)
local pindex = 1
local matched_indices = {}
if #plist == 0 then return end
if pnb == 0 then return end
while true do
if #boxes[i] < j then
j = j - #boxes[i]
@@ -1356,10 +1361,27 @@ local function all_matches(boxes, pattern, caseInsensitive)
end
if i > #boxes then break end
local box = boxes[i][j]
local word = caseInsensitive and box.word:lower() or box.word
if word:match(plist[pindex]) then
local word = caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word
local pword = plist[pindex]
local matched
if pnb == 1 then -- single word in plist
matched = word:find(pword, 1, true)
else -- multiple words in plist
if pindex == 1 then
-- first word of query should match at end of a word from the document
matched = word:sub(-#pword) == pword
elseif pindex == pnb then
-- last word of query should match at start of the word from the document
matched = word:sub(1, #pword) == pword
else
-- middle words in query should match exactly the word from the document
matched = word == pword
end
end
if matched then
table.insert(matched_indices, {i, j})
if pindex == #plist then
if pindex == pnb then
-- all words in plist iterated, all matched
return matched_indices
else
j = j + 1
@@ -1370,6 +1392,8 @@ local function all_matches(boxes, pattern, caseInsensitive)
end
end
end
-- Note that this returns a full word box, even if what matches
-- is only a substring of a word box.
return coroutine.wrap(function()
for i, line in ipairs(boxes) do
for j, box in ipairs(line) do