mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
Wikipedia Save as EPUB: various encoding fixes (#3851)
* Wiki Save as EPUB: various encoding fixes
  - Fix display of & in article titles.
  - Fix display of &, <, > in TOC entries and in the targeted anchors (the mismatch with the target id made these TOC entries invalid, so they were simply not displayed).
  - Remove the percent-encoded URL tweaks for crengine, now that crengine correctly supports such URLs (each percent-encoded sequence is handled as a UTF-8 byte). Bump crengine for that.
  - Don't include <meta name="cover"> when no cover is present.
* bump base/crengine
This commit is contained in:
@@ -529,15 +529,16 @@ function util.unicodeCodepointToUtf8(c)
|
||||
end
|
||||
end
|
||||
|
||||
-- we need to use an array of arrays to keep them ordered as written
|
||||
local HTML_ENTITIES_TO_UTF8 = {
|
||||
["&lt;"] = "<",
|
||||
["&gt;"] = ">",
|
||||
["&quot;"] = '"',
|
||||
["&apos;"] = "'",
|
||||
["&nbsp;"] = "\xC2\xA0",
|
||||
["&#(%d+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end,
|
||||
["&#x(%x+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end,
|
||||
["&amp;"] = "&", -- must be last
|
||||
{"&lt;", "<"},
|
||||
{"&gt;", ">"},
|
||||
{"&quot;", '"'},
|
||||
{"&apos;", "'"},
|
||||
{"&nbsp;", "\xC2\xA0"},
|
||||
{"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
|
||||
{"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end},
|
||||
{"&amp;", "&"}, -- must be last
|
||||
}
|
||||
--- Replace HTML entities with their UTF8 equivalent in text
|
||||
--
|
||||
@@ -546,8 +547,8 @@ local HTML_ENTITIES_TO_UTF8 = {
|
||||
--- @string text text with HTML entities
|
||||
--- @treturn string UTF8 text
|
||||
function util.htmlEntitiesToUtf8(text)
|
||||
for k,v in pairs(HTML_ENTITIES_TO_UTF8) do
|
||||
text = text:gsub(k, v)
|
||||
for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
|
||||
text = text:gsub(t[1], t[2])
|
||||
end
|
||||
return text
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user