From b045df3ff3a64f02c04024944da452567cc44e49 Mon Sep 17 00:00:00 2001 From: weijiuqiao <59040746+weijiuqiao@users.noreply.github.com> Date: Wed, 1 Jan 2025 18:28:11 +0800 Subject: [PATCH] Vocabbuilder.koplugin: fix PDF context extraction with hyphenation (#12975) As pointed out at https://github.com/koreader/koreader/issues/12916#issuecomment-2564755827. --- frontend/document/koptinterface.lua | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/frontend/document/koptinterface.lua b/frontend/document/koptinterface.lua index 9f06c6826..7e2eb6164 100644 --- a/frontend/document/koptinterface.lua +++ b/frontend/document/koptinterface.lua @@ -1199,7 +1199,22 @@ function KoptInterface:getSelectedWordContext(word, nb_words, pos) local i_end, j_end = i, j local word_array = util.splitToArray(word, " ") for idx, split_word in ipairs(word_array) do - if boxes[i_end][j_end].word ~= split_word then return end + local box_word = boxes[i_end][j_end].word + if box_word:sub(-1) == "-" and j_end == #boxes[i_end] and box_word ~= split_word then + -- Line final hyphenation. + -- Combine word with first word of next line. + box_word = box_word:sub(1, -2) + i_end = i_end + 1 + j_end = 1 + box_word = box_word .. boxes[i_end][j_end].word + elseif box_word:sub(-2, -1) == "\u{00AD}" and j_end == #boxes[i_end] and box_word ~= split_word then + -- Hyphen + box_word = box_word:sub(1, -3) + i_end = i_end + 1 + j_end = 1 + box_word = box_word .. boxes[i_end][j_end].word + end + if box_word ~= split_word then return end if idx ~= #word_array then if j_end == #boxes[i_end] then i_end = i_end + 1