Fulltext search: all entries in entire document (#11313)

This commit is contained in:
hius07
2024-01-13 12:58:05 +02:00
committed by GitHub
parent 5f5162d95c
commit 0ceb88a9a3
7 changed files with 287 additions and 46 deletions

View File

@@ -1097,49 +1097,60 @@ function KoptInterface:getWordFromNativePosition(doc, boxes, pos)
return word_box
end
function KoptInterface:getSelectedWordContext(word, nb_words, pos)
local boxes = self.last_text_boxes
if not pos or not boxes or #boxes == 0 then return end
local i, j = getWordBoxIndices(boxes, pos)
if boxes[i][j].word ~= word then return end
local li, wi = i, j
local prev_count, next_count = 0, 0
local prev_text, next_text = {}, {}
local function get_prev_text(boxes, i, j, nb_words)
local prev_count = 0
local prev_text = {}
while prev_count < nb_words do
if li == 1 and wi == 1 then
if i == 1 and j == 1 then
break
elseif wi == 1 then
li = li - 1
wi = #boxes[li]
elseif j == 1 then
i = i - 1
j = #boxes[i]
else
wi = wi - 1
j = j - 1
end
local current_word = boxes[li][wi].word
local current_word = boxes[i][j].word
if #current_word > 0 then
table.insert(prev_text, 1, current_word)
prev_count = prev_count + 1
end
end
if #prev_text > 0 then
return table.concat(prev_text, " ")
end
end
li, wi = i, j
local function get_next_text(boxes, i, j, nb_words)
local next_count = 0
local next_text = {}
while next_count < nb_words do
if li == #boxes and wi == #boxes[li] then
if i == #boxes and j == #boxes[i] then
break
elseif wi == #boxes[li] then
li = li + 1
wi = 1
elseif j == #boxes[i] then
i = i + 1
j = 1
else
wi = wi + 1
j = j + 1
end
local current_word = boxes[li][wi].word
local current_word = boxes[i][j].word
if #current_word > 0 then
table.insert(next_text, current_word)
next_count = next_count + 1
end
end
if #prev_text == 0 and #next_text == 0 then return end
return table.concat(prev_text, " "), table.concat(next_text, " ")
if #next_text > 0 then
return table.concat(next_text, " ")
end
end
function KoptInterface:getSelectedWordContext(word, nb_words, pos)
local boxes = self.last_text_boxes
if not pos or not boxes or #boxes == 0 then return end
local i, j = getWordBoxIndices(boxes, pos)
if boxes[i][j].word ~= word then return end
local prev_text = get_prev_text(boxes, i, j, nb_words)
local next_text = get_next_text(boxes, i, j, nb_words)
return prev_text, next_text
end
--[[--
@@ -1336,19 +1347,23 @@ function KoptInterface:nativeToPageRectTransform(doc, pageno, rect)
end
end
local function all_matches(boxes, pattern, caseInsensitive)
local function get_pattern_list(pattern, case_insensitive)
-- pattern list of single words
local plist = {}
-- (as in util.splitToWords(), but only splitting on spaces, keeping punctuations)
for word in util.gsplit(pattern, "%s+") do
if util.hasCJKChar(word) then
for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char)
table.insert(plist, case_insensitive and Utf8Proc.lowercase(util.fixUtf8(char, "?")) or char)
end
else
table.insert(plist, caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word)
table.insert(plist, case_insensitive and Utf8Proc.lowercase(util.fixUtf8(word, "?")) or word)
end
end
return plist
end
local function all_matches(boxes, plist, case_insensitive)
local pnb = #plist
-- return mached word indices from index i, j
local function match(i, j)
@@ -1362,7 +1377,7 @@ local function all_matches(boxes, pattern, caseInsensitive)
end
if i > #boxes then break end
local box = boxes[i][j]
local word = caseInsensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word
local word = case_insensitive and Utf8Proc.lowercase(util.fixUtf8(box.word, "?")) or box.word
local pword = plist[pindex]
local matched
if pnb == 1 then -- single word in plist
@@ -1407,11 +1422,12 @@ local function all_matches(boxes, pattern, caseInsensitive)
end)
end
function KoptInterface:findAllMatches(doc, pattern, caseInsensitive, page)
function KoptInterface:findAllMatches(doc, pattern, case_insensitive, page)
local text_boxes = doc:getPageTextBoxes(page)
if not text_boxes then return end
local plist = get_pattern_list(pattern, case_insensitive)
local matches = {}
for indices in all_matches(text_boxes or {}, pattern, caseInsensitive) do
for indices in all_matches(text_boxes, plist, case_insensitive) do
for _, index in ipairs(indices) do
local i, j = unpack(index)
local word = text_boxes[i][j]
@@ -1427,8 +1443,8 @@ function KoptInterface:findAllMatches(doc, pattern, caseInsensitive, page)
return matches
end
function KoptInterface:findText(doc, pattern, origin, reverse, caseInsensitive, pageno)
logger.dbg("Koptinterface: find text", pattern, origin, reverse, caseInsensitive, pageno)
function KoptInterface:findText(doc, pattern, origin, reverse, case_insensitive, pageno)
logger.dbg("Koptinterface: find text", pattern, origin, reverse, case_insensitive, pageno)
local last_pageno = doc:getPageCount()
local start_page, end_page
if reverse == 1 then
@@ -1456,7 +1472,7 @@ function KoptInterface:findText(doc, pattern, origin, reverse, caseInsensitive,
end
end
for i = start_page, end_page, (reverse == 1) and -1 or 1 do
local matches = self:findAllMatches(doc, pattern, caseInsensitive, i)
local matches = self:findAllMatches(doc, pattern, case_insensitive, i)
if #matches > 0 then
matches.page = i
return matches
@@ -1464,6 +1480,58 @@ function KoptInterface:findText(doc, pattern, origin, reverse, caseInsensitive,
end
end
function KoptInterface:findAllText(doc, pattern, case_insensitive, nb_context_words, max_hits)
local plist = get_pattern_list(pattern, case_insensitive)
local res = {}
for page = 1, doc:getPageCount() do
local text_boxes = doc:getPageTextBoxes(page)
if text_boxes then
for indices in all_matches(text_boxes, plist, case_insensitive) do -- each found pattern in the page
local res_item = { -- item of the Menu item_table
text = nil,
mandatory = page,
boxes = {}, -- to draw temp highlight in onMenuSelect
}
local text = {}
local i_prev, j_prev, i_next, j_next
for ind, index in ipairs(indices) do -- each word in the pattern
local i, j = unpack(index)
local word = text_boxes[i][j]
res_item.boxes[ind] = {
x = word.x0, y = word.y0,
w = word.x1 - word.x0,
h = word.y1 - word.y0,
}
text[ind] = word.word
if ind == 1 then
i_prev, j_prev = i, j
end
if ind == #indices then
i_next, j_next = i, j
end
end
text = "" .. table.concat(text, " ") .. ""
local prev_text = get_prev_text(text_boxes, i_prev, j_prev, nb_context_words)
if prev_text then
text = prev_text .. text
end
local next_text = get_next_text(text_boxes, i_next, j_next, nb_context_words)
if next_text then
text = text .. next_text
end
res_item.text = text
table.insert(res, res_item)
if #res == max_hits then
return res
end
end
end
end
if #res > 0 then
return res
end
end
--[[--
Log reflow duration.
--]]