mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
Merge pull request #1194 from chrox/pdf_search
add fulltext search for PDF documents
This commit is contained in:
@@ -8,7 +8,7 @@ local _ = require("gettext")
|
||||
|
||||
-- ReaderSearch module table, created from InputContainer.
local ReaderSearch = InputContainer:new{
    direction = 0, -- 0 for search forward, 1 for search backward
    -- Defined exactly once, as a boolean: CreDocument:findText turns it into
    -- 1/0 with `and 1 or 0`, and the PDF/DjVu word matcher only tests it for
    -- truthiness.
    case_insensitive = true, -- default to case insensitive
}
|
||||
|
||||
function ReaderSearch:init()
|
||||
@@ -33,7 +33,12 @@ function ReaderSearch:onShowSearchDialog(text)
|
||||
return function()
|
||||
local res = search_func(self, text, param)
|
||||
if res then
|
||||
self.ui.link:onGotoLink(res[1].start)
|
||||
if self.ui.document.info.has_pages then
|
||||
self.ui.link:onGotoLink({page = res.page - 1})
|
||||
self.view.highlight.temp[res.page] = res
|
||||
else
|
||||
self.ui.link:onGotoLink(res[1].start)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -73,7 +78,8 @@ end
|
||||
-- Run a text search on the currently open document.
--
-- pattern: the text to look for
-- origin:  passed through unchanged to the document's findText
--
-- Returns whatever the document's findText returns.
function ReaderSearch:search(pattern, origin)
    local direction = self.direction
    local case = self.case_insensitive
    -- The current page is always forwarded: the PDF and DjVu findText take it
    -- as their fifth argument, while CreDocument:findText declares only four
    -- parameters and therefore ignores it.
    local page = self.view.state.page
    return self.ui.document:findText(pattern, origin, direction, case, page)
end
|
||||
|
||||
function ReaderSearch:searchFromStart(pattern)
|
||||
|
||||
@@ -430,7 +430,7 @@ end
|
||||
|
||||
-- Search for `pattern` through the wrapped document object.
--
-- pattern, origin, reverse: forwarded unchanged to self._document:findText
-- caseInsensitive: truthy for a case-insensitive search
--
-- Returns whatever self._document:findText returns.
function CreDocument:findText(pattern, origin, reverse, caseInsensitive)
    DEBUG("CreDocument: find text", pattern, origin, reverse, caseInsensitive)
    -- The flag is handed down as 1/0 instead of a Lua boolean.
    -- NOTE(review): a numeric 0 is truthy in Lua and would be sent as 1, so
    -- callers should pass a real boolean here - confirm no caller passes 0.
    return self._document:findText(pattern, origin, reverse, caseInsensitive and 1 or 0)
end
|
||||
|
||||
function CreDocument:register(registry)
|
||||
|
||||
@@ -105,6 +105,10 @@ function DjvuDocument:getCoverPageImage()
|
||||
return self.koptinterface:getCoverPageImage(self)
|
||||
end
|
||||
|
||||
-- Text search for DjVu documents; the search itself is performed by the
-- document's koptinterface, which receives this document as first argument.
function DjvuDocument:findText(pattern, origin, reverse, caseInsensitive, page)
    local kopt = self.koptinterface
    return kopt:findText(self, pattern, origin, reverse, caseInsensitive, page)
end
|
||||
|
||||
-- Page rendering for DjVu documents; forwarded to the document's
-- koptinterface together with this document.
function DjvuDocument:renderPage(pageno, rect, zoom, rotation, gamma, render_mode)
    local kopt = self.koptinterface
    return kopt:renderPage(self, pageno, rect, zoom, rotation, gamma, render_mode)
end
|
||||
|
||||
@@ -934,11 +934,7 @@ end
|
||||
transform position in native page to reflowed page
|
||||
]]--
|
||||
function KoptInterface:nativeToReflowPosTransform(doc, pageno, pos)
|
||||
local bbox = doc:getPageBBox(pageno)
|
||||
local context_hash = self:getContextHash(doc, pageno, bbox)
|
||||
local kctx_hash = "kctx|"..context_hash
|
||||
local cached = Cache:check(kctx_hash)
|
||||
local kc = self:waitForContext(cached.kctx)
|
||||
local kc = self:getCachedContext(doc, pageno)
|
||||
--DEBUG("transform native pos", pos)
|
||||
local rpos = {}
|
||||
rpos.x, rpos.y = kc:nativeToReflowPosTransform(pos.x, pos.y)
|
||||
@@ -950,11 +946,7 @@ end
|
||||
transform position in reflowed page to native page
|
||||
]]--
|
||||
function KoptInterface:reflowToNativePosTransform(doc, pageno, abs_pos, rel_pos)
|
||||
local bbox = doc:getPageBBox(pageno)
|
||||
local context_hash = self:getContextHash(doc, pageno, bbox)
|
||||
local kctx_hash = "kctx|"..context_hash
|
||||
local cached = Cache:check(kctx_hash)
|
||||
local kc = self:waitForContext(cached.kctx)
|
||||
local kc = self:getCachedContext(doc, pageno)
|
||||
--kc:setDebug()
|
||||
--DEBUG("transform reflowed pos", abs_pos, rel_pos)
|
||||
local npos = {}
|
||||
@@ -1073,6 +1065,111 @@ function KoptInterface:nativeToPageRectTransform(doc, pageno, rect)
|
||||
end
|
||||
end
|
||||
|
||||
-- Iterate over every occurrence of a search phrase in a page's text boxes.
--
-- The phrase is cut into single-word patterns. An occurrence is a run of
-- consecutive words, possibly continuing onto following lines, in which the
-- k-th word matches the k-th pattern. Matching is unanchored, so a pattern
-- also matches inside a longer word.
--
-- boxes:           array of lines, each line an array of boxes with a
--                  string field `word`
-- pattern:         search phrase; every word of it is used as a Lua pattern
-- caseInsensitive: when truthy, both the patterns and the words are
--                  lowercased before matching (note this lowercases pattern
--                  classes too, e.g. "%D" becomes "%d")
--
-- Returns an iterator for a generic `for`; each step yields one occurrence as
-- a list of {line_index, word_index} pairs.
local function all_matches(boxes, pattern, caseInsensitive)
    -- pattern list of single words
    local plist = {}
    -- cut the phrase into chunks that end right after a multi-byte UTF-8
    -- sequence, so every multi-byte character closes a chunk
    for words in pattern:gmatch("[\32-\127\192-\255]+[\128-\191]*") do
        -- split space separated words
        for word in words:gmatch("[^%s]+") do
            plist[#plist + 1] = caseInsensitive and word:lower() or word
        end
    end
    -- An empty or whitespace-only phrase has no pattern to test against;
    -- report no occurrence instead of calling string matching with nil.
    if #plist == 0 then
        return function() return nil end
    end

    -- Does `word` contain a match for `pat`? A pattern that Lua rejects as
    -- malformed (e.g. a lone "%") falls back to a plain substring test, so
    -- free-form input cannot raise an error out of the search.
    local function word_matches(word, pat)
        local ok, found = pcall(string.find, word, pat)
        if ok then
            return found ~= nil
        end
        return string.find(word, pat, 1, true) ~= nil
    end

    -- return matched word indices for an occurrence starting at line i,
    -- word j, or nil when the phrase does not match there
    local function match(i, j)
        local matched_indices = {}
        for pindex = 1, #plist do
            -- wrap onto the next line once j runs past the end of the current
            -- one; looping here also steps over lines holding no words
            while i <= #boxes and j > #boxes[i] do
                j = j - #boxes[i]
                i = i + 1
            end
            -- ran out of text before the whole phrase was matched
            if i > #boxes then return nil end
            local box = boxes[i][j]
            local word = caseInsensitive and box.word:lower() or box.word
            if not word_matches(word, plist[pindex]) then return nil end
            matched_indices[#matched_indices + 1] = {i, j}
            j = j + 1
        end
        return matched_indices
    end

    return coroutine.wrap(function()
        for i, line in ipairs(boxes) do
            for j in ipairs(line) do
                local matches = match(i, j)
                if matches then
                    coroutine.yield(matches)
                end
            end
        end
    end)
end
|
||||
|
||||
-- Collect the rectangles of every word that takes part in an occurrence of
-- `pattern` on one page.
--
-- doc:             document providing getPageTextBoxes(page)
-- pattern:         search phrase, matched word by word by all_matches
-- caseInsensitive: truthy for a case-insensitive search
-- page:            page number to search
--
-- Returns an array with one rectangle per matched word, i.e. an occurrence of
-- an N-word phrase contributes N entries. The array is empty when nothing
-- matches or when the page has no text boxes.
function KoptInterface:findAllMatches(doc, pattern, caseInsensitive, page)
    local text_boxes = doc:getPageTextBoxes(page)
    -- Always hand back a table: callers take the length of the result.
    if not text_boxes then return {} end
    --DEBUG("boxes", text_boxes)
    local matches = {}
    for indices in all_matches(text_boxes, pattern, caseInsensitive) do
        for _, index in ipairs(indices) do
            -- index is a {line_index, word_index} pair
            local word = text_boxes[index[1]][index[2]]
            local word_box = {
                x = word.x0, y = word.y0,
                w = word.x1 - word.x0,
                h = word.y1 - word.y0,
            }
            -- rects will be transformed to reflowed page rects if needed
            matches[#matches + 1] = self:nativeToPageRectTransform(doc, page, word_box)
        end
    end
    return matches
end
|
||||
|
||||
-- Find the nearest page, in the requested direction, that contains `pattern`.
--
-- doc:             document to search
-- pattern:         search phrase (see findAllMatches)
-- origin:          selects the page range relative to `pageno`:
--                    0  start at the current page
--                    1  start at the page after (forward) / before (backward)
--                       the current one
--                   -1  search the remaining part of the document: first page
--                       up to the page before the current one (forward), or
--                       last page down to the page after it (backward)
-- reverse:         1 to search backward, anything else to search forward
-- caseInsensitive: truthy for a case-insensitive search
-- pageno:          current page number
--
-- Returns the match list of the first page with at least one match, with the
-- extra field `page` set to that page number; returns nothing when no page
-- matches or `origin` is not one of the values above.
function KoptInterface:findText(doc, pattern, origin, reverse, caseInsensitive, pageno)
    DEBUG("Koptinterface: find text", pattern, origin, reverse, caseInsensitive, pageno)
    local last_pageno = doc:getPageCount()
    local backward = reverse == 1
    local start_page, end_page
    if backward then
        if origin == 0 then
            -- from end of current page to first page
            start_page, end_page = pageno, 1
        elseif origin == -1 then
            -- from the last page to end of current page
            start_page, end_page = last_pageno, pageno + 1
        elseif origin == 1 then
            -- from previous page to the first page
            start_page, end_page = pageno - 1, 1
        end
    else
        if origin == 0 then
            -- from current page to the last page
            start_page, end_page = pageno, last_pageno
        elseif origin == -1 then
            -- from the first page to current page
            start_page, end_page = 1, pageno - 1
        elseif origin == 1 then
            -- from next page to the last page
            start_page, end_page = pageno + 1, last_pageno
        end
    end
    -- An unrecognised origin selects no range; a numeric `for` over nil
    -- bounds would raise, so report "not found" instead.
    if not start_page then return end
    -- An empty range (e.g. origin 1 on the last page) simply runs zero times.
    for i = start_page, end_page, backward and -1 or 1 do
        local matches = self:findAllMatches(doc, pattern, caseInsensitive, i)
        -- findAllMatches may hand back nil for a page without text boxes
        if matches and #matches > 0 then
            matches.page = i
            return matches
        end
    end
end
|
||||
|
||||
--[[
|
||||
helper functions
|
||||
--]]
|
||||
|
||||
@@ -183,6 +183,10 @@ function PdfDocument:getCoverPageImage()
|
||||
return self.koptinterface:getCoverPageImage(self)
|
||||
end
|
||||
|
||||
-- Text search for PDF documents; the search itself is performed by the
-- document's koptinterface, which receives this document as first argument.
function PdfDocument:findText(pattern, origin, reverse, caseInsensitive, page)
    local kopt = self.koptinterface
    return kopt:findText(self, pattern, origin, reverse, caseInsensitive, page)
end
|
||||
|
||||
-- Page rendering for PDF documents; forwarded to the document's
-- koptinterface together with this document.
function PdfDocument:renderPage(pageno, rect, zoom, rotation, gamma, render_mode)
    local kopt = self.koptinterface
    return kopt:renderPage(self, pageno, rect, zoom, rotation, gamma, render_mode)
end
|
||||
|
||||
@@ -4,6 +4,7 @@ local ReaderUI = require("apps/reader/readerui")
|
||||
local DEBUG = require("dbg")
|
||||
|
||||
local sample_epub = "spec/front/unit/data/juliet.epub"
|
||||
local sample_pdf = "spec/front/unit/data/sample.pdf"
|
||||
|
||||
describe("Readersearch module", function()
|
||||
describe("search API for EPUB documents", function()
|
||||
@@ -90,5 +91,104 @@ describe("Readersearch module", function()
|
||||
end)
|
||||
end)
|
||||
describe("search API for PDF documents", function()
    local doc, search, paging

    -- Number of matched word boxes for `pattern` on a single page.
    local function count_matches(pattern, case_insensitive, page)
        return #doc.koptinterface:findAllMatches(doc, pattern, case_insensitive, page)
    end

    -- Search for "test" from the current page in `direction` (0 forward,
    -- 1 backward) and check that any hit lies on the expected side of the
    -- page the search started from.
    local function check_from_current(direction, log_label, on_expected_side)
        paging:gotoPage(20)
        assert.truthy(search:searchFromCurrent("test", direction))
        for start = 1, 40, 10 do
            paging:gotoPage(start)
            local words = search:searchFromCurrent("test", direction)
            if words then
                DEBUG(log_label, words.page)
                assert.truthy(on_expected_side(words.page, start))
            end
        end
    end

    setup(function()
        local readerui = ReaderUI:new{
            document = DocumentRegistry:openDocument(sample_pdf),
        }
        doc, search, paging = readerui.document, readerui.search, readerui.paging
    end)

    it("should match single word with case insensitive option in one page", function()
        assert.are.equal(9, count_matches("what", true, 20))
        assert.are.equal(51, count_matches("the", true, 20))
        assert.are.equal(0, count_matches("xxxx", true, 20))
    end)

    it("should match single word with case sensitive option in one page", function()
        assert.are.equal(7, count_matches("what", false, 20))
        assert.are.equal(49, count_matches("the", false, 20))
        assert.are.equal(0, count_matches("xxxx", false, 20))
    end)

    -- Phrase counts are written as occurrences * words per phrase, because
    -- every matched word contributes one box.
    it("should match phrase in one page", function()
        assert.are.equal(2*2, count_matches("mean that", true, 20))
    end)

    it("should match whole phrase in one page", function()
        assert.are.equal(1*3, count_matches("mean that the", true, 20))
    end)

    it("should match with lua pattern", function()
        assert.are.equal(7*1, count_matches("chapter", true, 30))
        assert.are.equal(3*2, count_matches("chapter %d", true, 30))
        assert.are.equal(2*2, count_matches("chapter %d%d", true, 30))
        assert.are.equal(0*2, count_matches("chapter %d%d%d", true, 30))
    end)

    it("should not match empty string", function()
        assert.are.equal(0, count_matches("", true, 1))
    end)

    it("should not match on page without text layer", function()
        assert.are.equal(0, count_matches("e", true, 1))
    end)

    it("should search backward", function()
        check_from_current(1, "search backward: found at page", function(found, start)
            return found <= start
        end)
    end)

    it("should search forward", function()
        check_from_current(0, "search forward: found at page", function(found, start)
            return found >= start
        end)
    end)

    it("should find the first occurrence", function()
        -- starting after the first hit: the search from the start finds it
        for start = 20, 40, 10 do
            paging:gotoPage(start)
            local words = search:searchFromStart("test")
            assert.truthy(words)
            assert.are.equal(10, words.page)
        end
        -- starting before the first hit: nothing lies in between
        for start = 1, 10, 2 do
            paging:gotoPage(start)
            local words = search:searchFromStart("test")
            assert(words == nil)
        end
    end)

    it("should find the last occurrence", function()
        -- starting before the last hit: the search from the end finds it
        for start = 10, 30, 10 do
            paging:gotoPage(start)
            local words = search:searchFromEnd("test")
            assert.truthy(words)
            assert.are.equal(32, words.page)
        end
        -- starting after the last hit: nothing lies in between
        for start = 40, 50, 2 do
            paging:gotoPage(start)
            local words = search:searchFromEnd("test")
            assert(words == nil)
        end
    end)

    it("should find all occurrences", function()
        local total = 0
        paging:gotoPage(1)
        local hit = search:searchFromCurrent("test", 0)
        while hit do
            total = total + #hit
            --DEBUG("found words", #hit, hit.page)
            -- continue from the page of the latest hit
            paging:gotoPage(hit.page)
            hit = search:searchNext("test", 0)
        end
        assert.are.equal(11, total)
    end)
end)
|
||||
end)
|
||||
|
||||
Reference in New Issue
Block a user