mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
Other minor frontend.util cleanups (#5629)
* Resync fixUtf8 w/ upstream * Fix lastIndexOf desc * Drop unichar usage, it's a crappier unicodeCodepointToUtf8 ;).
This commit is contained in:
@@ -9,7 +9,8 @@ local Screen = require("device").screen
|
||||
local UIManager = require("ui/uimanager")
|
||||
local lfs = require("libs/libkoreader-lfs")
|
||||
local logger = require("logger")
|
||||
local util = require("ffi/util")
|
||||
local FFIUtil = require("ffi/util")
|
||||
local util = require("util")
|
||||
local _ = require("gettext")
|
||||
local T = require("ffi/util").template
|
||||
|
||||
@@ -255,7 +256,7 @@ function Search:find(option)
|
||||
s=string.sub(s, n, string.len(s)-j)
|
||||
end
|
||||
|
||||
s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unichar(tonumber(w, 16)) end)
|
||||
s=string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w) return util.unicodeCodepointToUtf8(tonumber(w, 16)) end)
|
||||
|
||||
return s
|
||||
end
|
||||
@@ -606,7 +607,7 @@ function Search:browse(option, run, chosen)
|
||||
if run == 1 then
|
||||
self.results = {}
|
||||
if option == "series" then
|
||||
for v,n in util.orderedPairs(self.browse_series) do
|
||||
for v,n in FFIUtil.orderedPairs(self.browse_series) do
|
||||
dummy = v
|
||||
if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end
|
||||
if string.find(dummy, upsearch, nil, true) then
|
||||
@@ -619,7 +620,7 @@ function Search:browse(option, run, chosen)
|
||||
end
|
||||
end
|
||||
else
|
||||
for v,n in util.orderedPairs(self.browse_tags) do
|
||||
for v,n in FFIUtil.orderedPairs(self.browse_tags) do
|
||||
dummy = v
|
||||
if not SEARCH_CASESENSITIVE then dummy = string.upper(dummy) end
|
||||
if string.find(dummy, upsearch, nil, true) then
|
||||
|
||||
@@ -567,7 +567,7 @@ function ReaderDictionary:cleanSelection(text)
|
||||
-- with plain ascii quote (for french words like "aujourd’hui")
|
||||
text = text:gsub("\xE2\x80\x99", "'") -- U+2019 (right single quotation mark)
|
||||
-- Strip punctuation characters around selection
|
||||
text = util.stripePunctuations(text)
|
||||
text = util.stripPunctuation(text)
|
||||
-- Strip some common english grammatical construct
|
||||
text = text:gsub("'s$", '') -- english possessive
|
||||
-- Strip some common french grammatical constructs
|
||||
|
||||
@@ -1116,7 +1116,7 @@ function ReaderHighlight:onHighlightSearch()
|
||||
logger.dbg("search highlight")
|
||||
self:highlightFromHoldPos()
|
||||
if self.selected_text then
|
||||
local text = require("util").stripePunctuations(self.selected_text.text)
|
||||
local text = require("util").stripPunctuation(self.selected_text.text)
|
||||
self.ui:handleEvent(Event:new("ShowSearchDialog", text))
|
||||
end
|
||||
end
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
--]]
|
||||
|
||||
local BaseUtil = require("ffi/util")
|
||||
local util = require("util")
|
||||
local logger = require("logger")
|
||||
|
||||
-- Hangul Syllables
|
||||
@@ -82,8 +83,8 @@ end
|
||||
|
||||
|
||||
function HgSylbls:get_combined_char(initial, medial, final)
|
||||
-- utf8.char()
|
||||
return BaseUtil.unichar(HgSylbls:_get_combined_charcode(initial, medial, final))
|
||||
-- utf8.char() (i.e., encode)
|
||||
return util.unicodeCodepointToUtf8(HgSylbls:_get_combined_charcode(initial, medial, final))
|
||||
end
|
||||
function HgSylbls:_get_combined_charcode(initial, medial, final)
|
||||
local len_medial = #HgSylbls.CHARS_MEDIAL
|
||||
@@ -145,7 +146,7 @@ function HgSylbls:in_vowel_char(char)
|
||||
HgSylbls.UNI_HG_COMPAT_VOWEL_BASE, HgSylbls.UNI_HG_COMPAT_VOWEL_UPPER)
|
||||
end
|
||||
function HgSylbls:_in_target_char_group(char, base, upper, compat_base, compat_upper)
|
||||
local code = BaseUtil.utf8charcode(char) -- utf8.codepoint()
|
||||
local code = BaseUtil.utf8charcode(char) -- utf8.codepoint() (i.e., decode)
|
||||
|
||||
if code == nil then
|
||||
return false
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
https://github.com/Wiladams/LAPHLibs
|
||||
--]]
|
||||
local util = require("ffi/util")
|
||||
local util = require("util")
|
||||
local luxl = require("luxl")
|
||||
local ffi = require("ffi")
|
||||
|
||||
@@ -23,7 +23,7 @@ local function unescape(str)
|
||||
if unescape_map[s] then
|
||||
return unescape_map[s]
|
||||
elseif n == "#" then -- unescape unicode
|
||||
return util.unichar(tonumber(s))
|
||||
return util.unicodeCodepointToUtf8(tonumber(s))
|
||||
else
|
||||
return orig
|
||||
end
|
||||
|
||||
@@ -7,19 +7,20 @@ local dbg = require("dbg")
|
||||
local _ = require("gettext")
|
||||
local T = BaseUtil.template
|
||||
|
||||
local lshift = bit.lshift
|
||||
local rshift = bit.rshift
|
||||
local band = bit.band
|
||||
local bor = bit.bor
|
||||
|
||||
local util = {}
|
||||
|
||||
--- Strips all punctuation and spaces from a string.
|
||||
--- Strips all punctuation marks and spaces from a string.
|
||||
---- @string text the string to be stripped
|
||||
---- @treturn string stripped text
|
||||
function util.stripePunctuations(text)
|
||||
function util.stripPunctuation(text)
|
||||
if not text then return end
|
||||
-- strip ASCII punctuation characters around text
|
||||
-- and strip any generic punctuation (U+2000 - U+206F) in the text
|
||||
-- strip ASCII punctuation marks around text
|
||||
-- and strip any generic punctuation marks (U+2000 - U+206F) in the text
|
||||
return text:gsub("\226[\128-\131][\128-\191]", ''):gsub("^%p+", ''):gsub("%p+$", '')
|
||||
end
|
||||
|
||||
@@ -286,7 +287,7 @@ function util.tableMerge(t1, t2)
|
||||
end
|
||||
|
||||
--[[--
|
||||
Gets last index of string in character
|
||||
Gets last index of character in string (i.e., strrchr)
|
||||
|
||||
Returns the index within this string of the last occurrence of the specified character
|
||||
or -1 if the character does not occur.
|
||||
@@ -348,7 +349,7 @@ function util.splitToChars(text)
|
||||
hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
|
||||
elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
|
||||
-- low surrogate following a high surrogate, good, let's make them a single char
|
||||
charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000
|
||||
charcode = lshift((hi_surrogate - 0xD800), 10) + (charcode - 0xDC00) + 0x10000
|
||||
table.insert(tab, util.unicodeCodepointToUtf8(charcode))
|
||||
hi_surrogate = nil
|
||||
else
|
||||
@@ -379,13 +380,13 @@ function util.hasCJKChar(str)
|
||||
return string.match(str, "[\228-\234][\128-\191].") ~= nil
|
||||
end
|
||||
|
||||
--- Split texts into a list of words, spaces and punctuation.
|
||||
--- Split texts into a list of words, spaces and punctuation marks.
|
||||
---- @string text text to split
|
||||
---- @treturn table list of words, spaces and punctuation
|
||||
---- @treturn table list of words, spaces and punctuation marks
|
||||
function util.splitToWords(text)
|
||||
local wlist = {}
|
||||
for word in util.gsplit(text, "[%s%p]+", true) do
|
||||
-- if space splitted word contains CJK characters
|
||||
-- if space split word contains CJK characters
|
||||
if util.hasCJKChar(word) then
|
||||
-- split with CJK characters
|
||||
for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
|
||||
@@ -399,11 +400,11 @@ function util.splitToWords(text)
|
||||
end
|
||||
|
||||
-- We don't want to split on a space if it is followed by some
|
||||
-- specific punctuation : e.g. "word :" or "word )"
|
||||
-- (In french, there is a space before a colon, and it better
|
||||
-- specific punctuation marks : e.g. "word :" or "word )"
|
||||
-- (In French, there is a non-breaking space before a colon, and it had better
|
||||
-- not be wrapped there.)
|
||||
local non_splittable_space_tailers = ":;,.!?)]}$%=-+*/|<>»”"
|
||||
-- Same if a space has some specific other punctuation before it
|
||||
-- Same if a space has some specific other punctuation mark before it
|
||||
local non_splittable_space_leaders = "([{$=-+*/|<>«“"
|
||||
|
||||
|
||||
@@ -460,20 +461,20 @@ function util.isSplittable(c, next_c, prev_c)
|
||||
return true
|
||||
end
|
||||
elseif c == " " then
|
||||
-- we only split on a space (so punctuation sticks to prev word)
|
||||
-- we only split on a space (so a punctuation mark sticks to prev word)
|
||||
-- if next_c or prev_c is provided, we can make a better decision
|
||||
if next_c and non_splittable_space_tailers:find(next_c, 1, true) then
|
||||
-- this space is followed by some punctuation that is better kept with us
|
||||
-- this space is followed by some punctuation mark that is better kept with us
|
||||
return false
|
||||
elseif prev_c and non_splittable_space_leaders:find(prev_c, 1, true) then
|
||||
-- this space is lead by some punctuation that is better kept with us
|
||||
-- this space is led by some punctuation mark that is better kept with us
|
||||
return false
|
||||
else
|
||||
-- we can split on this space
|
||||
return true
|
||||
end
|
||||
end
|
||||
-- otherwise, non splittable
|
||||
-- otherwise, not splittable
|
||||
return false
|
||||
end
|
||||
|
||||
@@ -570,7 +571,7 @@ local function replaceSlashChar(str)
|
||||
end
|
||||
|
||||
--[[--
|
||||
Replaces characters that are invalid filenames.
|
||||
Replaces characters that are invalid in filenames.
|
||||
|
||||
Replaces the characters `\/:*?"<>|` with an `_` unless an optional path is provided. These characters are problematic on Windows filesystems. On Linux only the `/` poses a problem.
|
||||
|
||||
@@ -683,7 +684,7 @@ function util.getMenuText(item)
|
||||
text = item.text
|
||||
end
|
||||
if item.sub_item_table ~= nil or item.sub_item_table_func then
|
||||
text = text .. " \226\150\184"
|
||||
text = text .. " ▸"
|
||||
end
|
||||
return text
|
||||
end
|
||||
@@ -692,6 +693,8 @@ end
|
||||
Replaces invalid UTF-8 characters with a replacement string.
|
||||
|
||||
Based on <http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua>.
|
||||
c.f., FixUTF8 @ <https://github.com/pkulchenko/ZeroBraneStudio/blob/master/src/util.lua>.
|
||||
|
||||
@string str the string to be checked for invalid characters
|
||||
@string replacement the string to replace invalid characters with
|
||||
@treturn string valid UTF-8
|
||||
@@ -700,15 +703,15 @@ function util.fixUtf8(str, replacement)
|
||||
local pos = 1
|
||||
local len = #str
|
||||
while pos <= len do
|
||||
if pos == str:find("[%z\1-\127]", pos) then pos = pos + 1
|
||||
elseif pos == str:find("[\194-\223][\128-\191]", pos) then pos = pos + 2
|
||||
elseif pos == str:find( "\224[\160-\191][\128-\191]", pos)
|
||||
or pos == str:find("[\225-\236][\128-\191][\128-\191]", pos)
|
||||
or pos == str:find( "\237[\128-\159][\128-\191]", pos)
|
||||
or pos == str:find("[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
|
||||
elseif pos == str:find( "\240[\144-\191][\128-\191][\128-\191]", pos)
|
||||
or pos == str:find("[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
|
||||
or pos == str:find( "\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4
|
||||
if str:find("^[%z\1-\127]", pos) then pos = pos + 1
|
||||
elseif str:find("^[\194-\223][\128-\191]", pos) then pos = pos + 2
|
||||
elseif str:find( "^\224[\160-\191][\128-\191]", pos)
|
||||
or str:find("^[\225-\236][\128-\191][\128-\191]", pos)
|
||||
or str:find( "^\237[\128-\159][\128-\191]", pos)
|
||||
or str:find("^[\238-\239][\128-\191][\128-\191]", pos) then pos = pos + 3
|
||||
elseif str:find( "^\240[\144-\191][\128-\191][\128-\191]", pos)
|
||||
or str:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", pos)
|
||||
or str:find( "^\244[\128-\143][\128-\191][\128-\191]", pos) then pos = pos + 4
|
||||
else
|
||||
str = str:sub(1, pos - 1) .. replacement .. str:sub(pos + 1)
|
||||
pos = pos + #replacement
|
||||
@@ -735,6 +738,7 @@ end
|
||||
--- Convert a Unicode codepoint (number) to UTF-8 char
|
||||
--- c.f., <https://stackoverflow.com/a/4609989>
|
||||
--- & <https://stackoverflow.com/a/38492214>
|
||||
--- See utf8charcode in ffi/util for a decoder.
|
||||
--
|
||||
--- @int c Unicode codepoint
|
||||
--- @treturn string UTF-8 char
|
||||
@@ -779,12 +783,12 @@ local HTML_ENTITIES_TO_UTF8 = {
|
||||
{"&", "&"}, -- must be last
|
||||
}
|
||||
--[[--
|
||||
Replace HTML entities with their UTF8 equivalent in text.
|
||||
Replace HTML entities with their UTF-8 encoded equivalent in text.
|
||||
|
||||
Supports only basic ones and those with numbers (no support for named entities like `&eacute;`).
|
||||
|
||||
@string text text with HTML entities
|
||||
@treturn string UTF8 text
|
||||
@treturn string UTF-8 text
|
||||
]]
|
||||
function util.htmlEntitiesToUtf8(text)
|
||||
for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
|
||||
@@ -834,7 +838,7 @@ function util.htmlToPlainTextIfHtml(text)
|
||||
is_html = true
|
||||
else
|
||||
-- no <tag> found
|
||||
-- but we may meet some text badly twicely encoded html containing "<br>"
|
||||
-- but we may encounter badly (i.e., doubly) encoded HTML text containing "&lt;br&gt;"
|
||||
local nb_encoded_tags
|
||||
_, nb_encoded_tags = text:gsub("<%a+>", "")
|
||||
if nb_encoded_tags > 0 then
|
||||
|
||||
@@ -6,14 +6,14 @@ describe("util module", function()
|
||||
util = require("util")
|
||||
end)
|
||||
|
||||
it("should strip punctuations around word", function()
|
||||
assert.is_equal("hello world", util.stripePunctuations("\"hello world\""))
|
||||
assert.is_equal("hello world", util.stripePunctuations("\"hello world?\""))
|
||||
assert.is_equal("hello, world", util.stripePunctuations("\"hello, world?\""))
|
||||
assert.is_equal("你好", util.stripePunctuations("“你好“"))
|
||||
assert.is_equal("你好", util.stripePunctuations("“你好?“"))
|
||||
assert.is_equal("", util.stripePunctuations(""))
|
||||
assert.is_nil(util.stripePunctuations(nil))
|
||||
it("should strip punctuation marks around word", function()
|
||||
assert.is_equal("hello world", util.stripPunctuation("\"hello world\""))
|
||||
assert.is_equal("hello world", util.stripPunctuation("\"hello world?\""))
|
||||
assert.is_equal("hello, world", util.stripPunctuation("\"hello, world?\""))
|
||||
assert.is_equal("你好", util.stripPunctuation("“你好“"))
|
||||
assert.is_equal("你好", util.stripPunctuation("“你好?“"))
|
||||
assert.is_equal("", util.stripPunctuation(""))
|
||||
assert.is_nil(util.stripPunctuation(nil))
|
||||
end)
|
||||
|
||||
describe("gsplit()", function()
|
||||
|
||||
Reference in New Issue
Block a user