From aedab2d695c0ca05c9fa9ccc9343d084d54a13b6 Mon Sep 17 00:00:00 2001 From: poire-z Date: Fri, 1 Jan 2021 14:34:51 +0100 Subject: [PATCH] Dict/Wiki lookup: less text cleanup on manual input Don't clean up input text as much when it is entered manually (or when it's otherwise known to be sane) as when it comes from book text selection. This may allow looking up words like "-suffix", or doing more precise Wikipedia queries. --- .../apps/reader/modules/readerdictionary.lua | 58 ++++++++++--------- .../apps/reader/modules/readerhighlight.lua | 4 +- frontend/apps/reader/modules/readerlink.lua | 2 +- .../apps/reader/modules/readerwikipedia.lua | 16 ++--- frontend/ui/widget/dictquicklookup.lua | 12 ++-- 5 files changed, 51 insertions(+), 41 deletions(-) diff --git a/frontend/apps/reader/modules/readerdictionary.lua b/frontend/apps/reader/modules/readerdictionary.lua index ccdc1f907..ab214814c 100644 --- a/frontend/apps/reader/modules/readerdictionary.lua +++ b/frontend/apps/reader/modules/readerdictionary.lua @@ -214,7 +214,8 @@ function ReaderDictionary:addToMainMenu(menu_items) os.date("%Y-%m-%d %H:%M:%S", value.time), value.word, callback = function() - self:onLookupWord(value.word) + -- Word had been cleaned before being added to history + self:onLookupWord(value.word, true) end }) end @@ -385,10 +386,10 @@ function ReaderDictionary:addToMainMenu(menu_items) end end -function ReaderDictionary:onLookupWord(word, box, highlight, link) +function ReaderDictionary:onLookupWord(word, is_sane, box, highlight, link) logger.dbg("dict lookup word:", word, box) -- escape quotes and other funny characters in word - word = self:cleanSelection(word) + word = self:cleanSelection(word, is_sane) logger.dbg("dict stripped word:", word) self.highlight = highlight @@ -609,7 +610,7 @@ local function tidyMarkup(results) return results end -function ReaderDictionary:cleanSelection(text) +function ReaderDictionary:cleanSelection(text, is_sane) -- Will be used by ReaderWikipedia too if not text then return "" @@ -618,31 
+619,33 @@ function ReaderDictionary:cleanSelection(text) -- some cleanup is still needed for selection we get from other engines -- (example: pdf selection "qu’autrefois," will be cleaned to "autrefois") -- + -- Replace no-break space with regular space + text = text:gsub("\xC2\xA0", ' ') -- U+00A0 no-break space -- Trim any space at start or end text = text:gsub("^%s+", "") text = text:gsub("%s+$", "") - -- Replace extended quote (included in the general puncturation range) - -- with plain ascii quote (for french words like "aujourd’hui") - text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark) - -- Strip punctuation characters around selection - text = util.stripPunctuation(text) - -- Strip some common english grammatical construct - text = text:gsub("'s$", '') -- english possessive - -- Strip some common french grammatical constructs - text = text:gsub("^[LSDMNTlsdmnt]'", '') -- french l' s' t'... - text = text:gsub("^[Qq][Uu]'", '') -- french qu' - -- Replace no-break space with regular space - text = text:gsub("\xC2\xA0", ' ') -- U+00A0 no-break space - -- There may be a need to remove some (all?) diacritical marks - -- https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges - -- see discussion at https://github.com/koreader/koreader/issues/1649 - -- Commented for now, will have to be checked by people who read - -- languages and texts that use them. 
- -- text = text:gsub("\204[\128-\191]", '') -- U+0300 to U+033F - -- text = text:gsub("\205[\128-\175]", '') -- U+0340 to U+036F - -- Trim any space now at start or end after above changes - text = text:gsub("^%s+", "") - text = text:gsub("%s+$", "") + if not is_sane then + -- Replace extended quote (included in the general puncturation range) + -- with plain ascii quote (for french words like "aujourd’hui") + text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark) + -- Strip punctuation characters around selection + text = util.stripPunctuation(text) + -- Strip some common english grammatical construct + text = text:gsub("'s$", '') -- english possessive + -- Strip some common french grammatical constructs + text = text:gsub("^[LSDMNTlsdmnt]'", '') -- french l' s' t'... + text = text:gsub("^[Qq][Uu]'", '') -- french qu' + -- There may be a need to remove some (all?) diacritical marks + -- https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges + -- see discussion at https://github.com/koreader/koreader/issues/1649 + -- Commented for now, will have to be checked by people who read + -- languages and texts that use them. 
+ -- text = text:gsub("\204[\128-\191]", '') -- U+0300 to U+033F + -- text = text:gsub("\205[\128-\175]", '') -- U+0340 to U+036F + -- Trim any space now at start or end after above changes + text = text:gsub("^%s+", "") + text = text:gsub("%s+$", "") + end return text end @@ -680,7 +683,8 @@ function ReaderDictionary:onShowDictionaryLookup() is_enter_default = true, callback = function() UIManager:close(self.dictionary_lookup_dialog) - self:onLookupWord(self.dictionary_lookup_dialog:getInputText()) + -- Trust that input text does not need any cleaning (allows querying for "-suffix") + self:onLookupWord(self.dictionary_lookup_dialog:getInputText(), true) end, }, } diff --git a/frontend/apps/reader/modules/readerhighlight.lua b/frontend/apps/reader/modules/readerhighlight.lua index ba32f6f76..8739c6d5c 100644 --- a/frontend/apps/reader/modules/readerhighlight.lua +++ b/frontend/apps/reader/modules/readerhighlight.lua @@ -937,14 +937,14 @@ function ReaderHighlight:lookup(selected_word, selected_link) -- if we extracted text directly if selected_word.word then local word_box = self.view:pageToScreenTransform(self.hold_pos.page, selected_word.sbox) - self.ui:handleEvent(Event:new("LookupWord", selected_word.word, word_box, self, selected_link)) + self.ui:handleEvent(Event:new("LookupWord", selected_word.word, false, word_box, self, selected_link)) -- or we will do OCR elseif selected_word.sbox and self.hold_pos then local word = self.ui.document:getOCRWord(self.hold_pos.page, selected_word) logger.dbg("OCRed word:", word) if word and word ~= "" then local word_box = self.view:pageToScreenTransform(self.hold_pos.page, selected_word.sbox) - self.ui:handleEvent(Event:new("LookupWord", word, word_box, self, selected_link)) + self.ui:handleEvent(Event:new("LookupWord", word, false, word_box, self, selected_link)) else UIManager:show(InfoMessage:new{ text = info_message_ocr_text, diff --git a/frontend/apps/reader/modules/readerlink.lua 
b/frontend/apps/reader/modules/readerlink.lua index e08dceb4b..7b55216b0 100644 --- a/frontend/apps/reader/modules/readerlink.lua +++ b/frontend/apps/reader/modules/readerlink.lua @@ -711,7 +711,7 @@ function ReaderLink:onGoToExternalLink(link_url) callback = function() UIManager:nextTick(function() UIManager:close(dialog) - self.ui:handleEvent(Event:new("LookupWikipedia", wiki_page, false, true, wiki_lang)) + self.ui:handleEvent(Event:new("LookupWikipedia", wiki_page, true, false, true, wiki_lang)) end) end, }) diff --git a/frontend/apps/reader/modules/readerwikipedia.lua b/frontend/apps/reader/modules/readerwikipedia.lua index fd89d2ee8..7668748f9 100644 --- a/frontend/apps/reader/modules/readerwikipedia.lua +++ b/frontend/apps/reader/modules/readerwikipedia.lua @@ -53,7 +53,8 @@ function ReaderWikipedia:lookupInput() is_enter_default = true, callback = function() UIManager:close(self.input_dialog) - self:onLookupWikipedia(self.input_dialog:getInputText()) + -- Trust that input text does not need any cleaning (allows querying for "-suffix") + self:onLookupWikipedia(self.input_dialog:getInputText(), true) end, }, } @@ -98,7 +99,8 @@ function ReaderWikipedia:addToMainMenu(menu_items) os.date("%Y-%m-%d %H:%M:%S", value.time), text, callback = function() - self:onLookupWikipedia(value.word, nil, value.page, value.lang) + -- Word had been cleaned before being added to history + self:onLookupWikipedia(value.word, true, nil, value.page, value.lang) end }) end @@ -375,16 +377,16 @@ function ReaderWikipedia:initLanguages(word) end end -function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang) +function ReaderWikipedia:onLookupWikipedia(word, is_sane, box, get_fullpage, forced_lang) -- Wrapped through Trapper, as we may be using Trapper:dismissableRunInSubprocess() in it Trapper:wrap(function() - self:lookupWikipedia(word, box, get_fullpage, forced_lang) + self:lookupWikipedia(word, is_sane, box, get_fullpage, forced_lang) end) return true end 
-function ReaderWikipedia:lookupWikipedia(word, box, get_fullpage, forced_lang) - if NetworkMgr:willRerunWhenOnline(function() self:lookupWikipedia(word, box, get_fullpage, forced_lang) end) then +function ReaderWikipedia:lookupWikipedia(word, is_sane, box, get_fullpage, forced_lang) + if NetworkMgr:willRerunWhenOnline(function() self:lookupWikipedia(word, is_sane, box, get_fullpage, forced_lang) end) then -- Not online yet, nothing more to do here, NetworkMgr will forward the callback and run it once connected! return end @@ -404,7 +406,7 @@ function ReaderWikipedia:lookupWikipedia(word, box, get_fullpage, forced_lang) -- no need to clean word if get_fullpage, as it is the exact wikipetia page title if word and not get_fullpage then -- escape quotes and other funny characters in word - word = self:cleanSelection(word) + word = self:cleanSelection(word, is_sane) -- no need to lower() word with wikipedia search end logger.dbg("stripped word:", word) diff --git a/frontend/ui/widget/dictquicklookup.lua b/frontend/ui/widget/dictquicklookup.lua index ff4f17c92..c2450d222 100644 --- a/frontend/ui/widget/dictquicklookup.lua +++ b/frontend/ui/widget/dictquicklookup.lua @@ -1102,7 +1102,8 @@ function DictQuickLookup:inputLookup() else event = "LookupWord" end - self.ui:handleEvent(Event:new(event, word)) + -- Trust that input text does not need any cleaning (allows querying for "-suffix") + self.ui:handleEvent(Event:new(event, word, true)) end end @@ -1131,18 +1132,21 @@ end function DictQuickLookup:lookupWikipedia(get_fullpage) local word + local is_sane if get_fullpage then -- we use the word of the displayed result's definition, which -- is the exact title of the full wikipedia page word = self.lookupword + is_sane = true else -- we use the original word that was querried word = self.word + is_sane = false end self:resyncWikiLanguages() - -- strange : we need to pass false instead of nil if word_box is nil, - -- otherwise get_fullpage is not passed - 
self.ui:handleEvent(Event:new("LookupWikipedia", word, self.word_box and self.word_box or false, get_fullpage)) + -- (With Event, we need to pass false instead of nil if word_box is nil, + -- otherwise next arguments are discarded) + self.ui:handleEvent(Event:new("LookupWikipedia", word, is_sane, self.word_box and self.word_box or false, get_fullpage)) end return DictQuickLookup