Vocabbuilder.koplugin: fix PDF context extraction with hyphenation (#12975)

As pointed out at https://github.com/koreader/koreader/issues/12916#issuecomment-2564755827.
This commit is contained in:
weijiuqiao
2025-01-01 18:28:11 +08:00
committed by GitHub
parent a707b7c163
commit b045df3ff3

View File

@@ -1199,7 +1199,22 @@ function KoptInterface:getSelectedWordContext(word, nb_words, pos)
local i_end, j_end = i, j
local word_array = util.splitToArray(word, " ")
for idx, split_word in ipairs(word_array) do
if boxes[i_end][j_end].word ~= split_word then return end
local box_word = boxes[i_end][j_end].word
if box_word:sub(-1) == "-" and j_end == #boxes[i_end] and box_word ~= split_word then
-- Line final hyphenation.
-- Combine word with first word of next line.
box_word = box_word:sub(1, -2)
i_end = i_end + 1
j_end = 1
box_word = box_word .. boxes[i_end][j_end].word
elseif box_word:sub(-2, -1) == "\u{00AD}" and j_end == #boxes[i_end] and box_word ~= split_word then
-- Hyphen
box_word = box_word:sub(1, -3)
i_end = i_end + 1
j_end = 1
box_word = box_word .. boxes[i_end][j_end].word
end
if box_word ~= split_word then return end
if idx ~= #word_array then
if j_end == #boxes[i_end] then
i_end = i_end + 1