mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
Text search: various Kopt search fixes
- Properly parse input text for words (the previous code wasn't working with Greek letters) - With multiple words search, don't allow "substring matching" for words in the middle - Remove support for Lua pattens, so to get proper substring matching (as we have with cre text search)
This commit is contained in:
@@ -13,6 +13,7 @@ local Geom = require("ui/geometry")
|
||||
local KOPTContext = require("ffi/koptcontext")
|
||||
local Persist = require("persist")
|
||||
local TileCacheItem = require("document/tilecacheitem")
|
||||
local Utf8Proc = require("ffi/utf8proc")
|
||||
local logger = require("logger")
|
||||
local util = require("util")
|
||||
|
||||
@@ -1337,18 +1338,22 @@ end
|
||||
local function all_matches(boxes, pattern, caseInsensitive)
|
||||
-- pattern list of single words
|
||||
local plist = {}
|
||||
-- split utf-8 characters
|
||||
for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
|
||||
-- split space seperated words
|
||||
for word in words:gmatch("[^%s]+") do
|
||||
table.insert(plist, caseInsensitive and word:lower() or word)
|
||||
-- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations)
|
||||
for word in util.gsplit(pattern, "%s+") do
|
||||
if util.hasCJKChar(word) then
|
||||
for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
|
||||
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char)
|
||||
end
|
||||
else
|
||||
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word)
|
||||
end
|
||||
end
|
||||
local pnb = #plist
|
||||
-- return mached word indices from index i, j
|
||||
local function match(i, j)
|
||||
local pindex = 1
|
||||
local matched_indices = {}
|
||||
if #plist == 0 then return end
|
||||
if pnb == 0 then return end
|
||||
while true do
|
||||
if #boxes[i] < j then
|
||||
j = j - #boxes[i]
|
||||
@@ -1356,10 +1361,27 @@ local function all_matches(boxes, pattern, caseInsensitive)
|
||||
end
|
||||
if i > #boxes then break end
|
||||
local box = boxes[i][j]
|
||||
local word = caseInsensitive and box.word:lower() or box.word
|
||||
if word:match(plist[pindex]) then
|
||||
local word = caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word
|
||||
local pword = plist[pindex]
|
||||
local matched
|
||||
if pnb == 1 then -- single word in plist
|
||||
matched = word:find(pword, 1, true)
|
||||
else -- multiple words in plist
|
||||
if pindex == 1 then
|
||||
-- first word of query should match at end of a word from the document
|
||||
matched = word:sub(-#pword) == pword
|
||||
elseif pindex == pnb then
|
||||
-- last word of query should match at start of the word from the document
|
||||
matched = word:sub(1, #pword) == pword
|
||||
else
|
||||
-- middle words in query should match exactly the word from the document
|
||||
matched = word == pword
|
||||
end
|
||||
end
|
||||
if matched then
|
||||
table.insert(matched_indices, {i, j})
|
||||
if pindex == #plist then
|
||||
if pindex == pnb then
|
||||
-- all words in plist iterated, all matched
|
||||
return matched_indices
|
||||
else
|
||||
j = j + 1
|
||||
@@ -1370,6 +1392,8 @@ local function all_matches(boxes, pattern, caseInsensitive)
|
||||
end
|
||||
end
|
||||
end
|
||||
-- Note that this returns a full word box, even if what matches
|
||||
-- is only a substring of a word box.
|
||||
return coroutine.wrap(function()
|
||||
for i, line in ipairs(boxes) do
|
||||
for j, box in ipairs(line) do
|
||||
|
||||
Reference in New Issue
Block a user