diff --git a/base b/base index 243533e95..e8a24fe9b 160000 --- a/base +++ b/base @@ -1 +1 @@ -Subproject commit 243533e95ffb61b841b70aa4d50fd80df9cbfcaa +Subproject commit e8a24fe9b99b4c4ceb6c9329648e7a2f4d10bc0c diff --git a/frontend/apps/reader/modules/readerhighlight.lua b/frontend/apps/reader/modules/readerhighlight.lua index 780bf9ea7..e78546141 100644 --- a/frontend/apps/reader/modules/readerhighlight.lua +++ b/frontend/apps/reader/modules/readerhighlight.lua @@ -357,6 +357,7 @@ function ReaderHighlight:onHoldRelease() callback = function() UIManager:scheduleIn(0.1, function() self:lookupWikipedia() + self:onClose() end) end, }, diff --git a/frontend/apps/reader/modules/readerwikipedia.lua b/frontend/apps/reader/modules/readerwikipedia.lua index 9372269de..8b97473e7 100644 --- a/frontend/apps/reader/modules/readerwikipedia.lua +++ b/frontend/apps/reader/modules/readerwikipedia.lua @@ -101,7 +101,7 @@ function ReaderWikipedia:initLanguages(word) end end -function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage) +function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang) if not NetworkMgr:isOnline() then NetworkMgr:promptWifiOn() return @@ -109,8 +109,14 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage) -- word is the text to query. If get_fullpage is true, it is the -- exact wikipedia page title we want the full page of. 
self:initLanguages(word) - -- use first lang from self.wiki_languages, which may have been rotated by DictQuickLookup - local lang = self.wiki_languages[1] + local lang + if forced_lang then + -- use provided lang (from readerlink when noticing that an external link is a wikipedia url) + lang = forced_lang + else + -- use first lang from self.wiki_languages, which may have been rotated by DictQuickLookup + lang = self.wiki_languages[1] + end logger.dbg("lookup word:", word, box, get_fullpage) -- no need to clean word if get_fullpage, as it is the exact wikipetia page title if word and not get_fullpage then @@ -166,6 +172,7 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage) word = page.title, definition = definition, is_fullpage = get_fullpage, + lang = lang, } table.insert(results, result) end @@ -179,6 +186,7 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage) word = word, definition = self.no_page, is_fullpage = get_fullpage, + lang = lang, } } logger.dbg("dummy result table:", word, results) diff --git a/frontend/ui/widget/dictquicklookup.lua b/frontend/ui/widget/dictquicklookup.lua index 263e4d086..c3772b36b 100644 --- a/frontend/ui/widget/dictquicklookup.lua +++ b/frontend/ui/widget/dictquicklookup.lua @@ -19,6 +19,7 @@ local Device = require("device") local Geom = require("ui/geometry") local Event = require("ui/event") local Font = require("ui/font") +local util = require("util") local logger = require("logger") local _ = require("gettext") local T = require("ffi/util").template @@ -245,10 +246,67 @@ function DictQuickLookup:update() -- Different sets of buttons if fullpage or not local buttons if self.is_fullpage then - -- Only a single wide close button, get a little more room for - -- closing by taping at bottom (on footer or on this button) + -- A save and a close button buttons = { { + { + text = "Save as epub", + callback = function() + local InfoMessage = require("ui/widget/infomessage") + local ConfirmBox = 
require("ui/widget/confirmbox") + -- if forced_lang was specified, it may not be in our wiki_languages, + -- but ReaderWikipedia will have put it in result.lang + local lang = self.lang or self.wiki_languages_copy[1] + -- Just to be safe (none of the invalid chars, except ':' for uninteresting + -- Portal: or File: wikipedia pages, should be in lookup_word) + local cleaned_lookupword = util.replaceInvalidChars(self.lookupword) + local filename = cleaned_lookupword .. "."..string.upper(lang)..".epub" + -- Find a directory to save file into + local dir = G_reader_settings:readSetting("wikipedia_save_dir") + if not dir then dir = G_reader_settings:readSetting("download_dir") end -- OPDS dir + if not dir then dir = G_reader_settings:readSetting("home_dir") end + if not dir then dir = G_reader_settings:readSetting("lastdir") end + if not dir then + UIManager:show(InfoMessage:new{ + text = _("No directory to save page to !"), + }) + return + end + local epub_path = dir .. "/" .. filename + UIManager:show(ConfirmBox:new{ + text = T(_("Save as %1 ?"), filename), + ok_callback = function() + UIManager:scheduleIn(0.1, function() + local Wikipedia = require("ui/wikipedia") + Wikipedia:createEpubWithUI(epub_path, self.lookupword, lang, function(success) + if success then + UIManager:show(ConfirmBox:new{ + text = T(_("Page saved to:\n%1\n\nWould you like to read the downloaded page now?"), epub_path), + ok_callback = function() + -- close all dict/wiki windows, without scheduleIn(highlight.clear()) + self:onHoldClose(true) + -- close current ReaderUI in 1 sec, and create a new one + UIManager:scheduleIn(1.0, function() + local ReaderUI = require("apps/reader/readerui") + local reader = ReaderUI:_getRunningInstance() + if reader then + reader:onClose() + end + ReaderUI:showReader(epub_path) + end) + end, + }) + else + UIManager:show(InfoMessage:new{ + text = _("Failed saving Wikipedia page."), + }) + end + end) + end) + end + }) + end, + }, { text = "Close", callback = 
function() @@ -457,6 +515,7 @@ function DictQuickLookup:changeDictionary(index) self.lookupword = self.results[index].word self.definition = self.results[index].definition self.is_fullpage = self.results[index].is_fullpage + self.lang = self.results[index].lang if self.is_fullpage then self.displayword = self.lookupword else @@ -546,12 +605,12 @@ function DictQuickLookup:onClose() return true end -function DictQuickLookup:onHoldClose() +function DictQuickLookup:onHoldClose(no_clear) self:onClose() for i = #self.window_list, 1, -1 do local window = self.window_list[i] -- if one holds a highlight, let's clear it like in onClose() - if window.highlight then + if window.highlight and not no_clear then UIManager:scheduleIn(1, function() window.highlight:clear() end) diff --git a/frontend/ui/wikipedia.lua b/frontend/ui/wikipedia.lua index 7437a2a4e..498b36301 100644 --- a/frontend/ui/wikipedia.lua +++ b/frontend/ui/wikipedia.lua @@ -1,10 +1,18 @@ local JSON = require("json") local logger = require("logger") +local util = require("ffi/util") +local _ = require("gettext") +local T = require("ffi/util").template --[[ -- Query wikipedia using Wikimedia Web API. 
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max -- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader +-- +-- To get parsed HTML : +-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book +-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection +-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse --]] local Wikipedia = { @@ -37,16 +45,33 @@ local Wikipedia = { -- (otherwise, we get the full text for only the first result, and -- no text at all for the others }, + wiki_phtml_params = { + action = "parse", + format = "json", + -- we only need the following informations + prop = "text|sections|displaytitle|revid", + -- page = nil, -- text to lookup, will be added below + -- disabletoc = "", -- if we want to remove toc IN html + disablelimitreport = "", + disableeditsection = "", + }, + -- allow for disabling prettifying full page text + wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"), } function Wikipedia:getWikiServer(lang) return string.format(self.wiki_server, lang or self.default_lang) end +-- Possible values for page_type parameter to loadPage() +local WIKIPEDIA_INTRO = 1 +local WIKIPEDIA_FULL = 2 +local WIKIPEDIA_PHTML = 3 + --[[ -- return decoded JSON table from Wikipedia --]] -function Wikipedia:loadPage(text, lang, intro, plain) +function Wikipedia:loadPage(text, lang, page_type, plain) local socket = require('socket') local url = require('socket.url') local http = require('socket.http') @@ -58,18 +83,25 @@ function Wikipedia:loadPage(text, lang, intro, plain) local parsed = url.parse(self:getWikiServer(lang)) parsed.path = self.wiki_path - if intro == true then -- search query + if page_type == WIKIPEDIA_INTRO then -- search query 
self.wiki_search_params.explaintext = plain and "" or nil for k,v in pairs(self.wiki_search_params) do - query = query .. k .. '=' .. v .. '&' + query = string.format("%s%s=%s&", query, k, v) end parsed.query = query .. "gsrsearch=" .. url.escape(text) - else -- full page content + elseif page_type == WIKIPEDIA_FULL then -- full page content self.wiki_params.explaintext = plain and "" or nil for k,v in pairs(self.wiki_params) do - query = query .. k .. '=' .. v .. '&' + query = string.format("%s%s=%s&", query, k, v) end parsed.query = query .. "titles=" .. url.escape(text) + elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content + for k,v in pairs(self.wiki_phtml_params) do + query = string.format("%s%s=%s&", query, k, v) + end + parsed.query = query .. "page=" .. url.escape(text) + else + return end -- HTTP request @@ -107,7 +139,7 @@ end -- search wikipedia and get intros for results function Wikipedia:wikintro(text, lang) - local result = self:loadPage(text, lang, true, true) + local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true) if result then local query = result.query if query then @@ -118,14 +150,720 @@ end -- get full content of a wiki page function Wikipedia:wikifull(text, lang) - local result = self:loadPage(text, lang, false, true) + local result = self:loadPage(text, lang, WIKIPEDIA_FULL, true) if result then local query = result.query if query then + if self.wiki_prettify then + -- Prettification of the plain text full page + for pageid, page in pairs(query.pages) do + if page.extract then + page.extract = self:prettifyText(page.extract) + end + end + end return query.pages end end end +-- get parsed html content and other infos of a wiki page (raises an error if none could be fetched) +function Wikipedia:wikiphtml(text, lang) + local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true) + if result and result.parse then + return result.parse + end + -- no parse data: result is nil (request failed) or carries an API error; always raise a readable message + local info = result and result.error and result.error.info + error(info or _("No usable response from Wikipedia.")) +end + +-- UTF8 of unicode geometrical 
shapes we can use to replace +-- the "=== title ===" of wikipedia plaintext pages +-- These chosen ones are available in most fonts (prettier symbols +-- exist in unicode, but are available in a few fonts only) and +-- have a quite consistent size/weight in all fonts. +local th1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?) +local th2_sym = "\xE2\x96\x89" -- big black square +local th3_sym = "\xC2\xA0\xE2\x97\x86" -- black diamond (indented, nicer) +local th4_sym = "\xE2\x97\xA4" -- black upper left triangle +local th5_sym = "\xE2\x9C\xBF" -- black florette +local th6_sym = "\xE2\x9D\x96" -- black diamond minus white x +-- Others available in most fonts +-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross +-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle +-- local thX_sym = "\xE2\x97\x89" -- fish eye +-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right + +-- For optional prettification of the plain text full page +function Wikipedia:prettifyText(text) + -- We use \a for an additional leading \n that we don't want shortened later + text = text:gsub("\n= ", "\n\a"..th1_sym.." ") -- 2 empty lines before + text = text:gsub("\n== ", "\n\a"..th2_sym.." ") -- 2 empty lines before + text = text:gsub("\n=== ", "\n"..th3_sym.." ") + text = text:gsub("\n==== ", "\n"..th4_sym.." ") + text = text:gsub("\n===== ", "\n"..th5_sym.." ") + text = text:gsub("\n====== ", "\n"..th6_sym.." 
") + text = text:gsub("Modifier ==", " ==") -- fr wikipedia fix for some articles modified by clumsy editors + text = text:gsub("==$", "==\n") -- for a at end of text to be matched by next gsub + text = text:gsub(" ===?\n+", "\n\n") -- to : empty line after + text = text:gsub(" ====+\n+", "\n") -- to : single \n, no empty line + text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to prev text + text = text:gsub("\n +\n", "\n") -- trim lines full of only spaces (often seen in math formulas) + text = text:gsub("^\n*", "") -- trim new lines at start + text = text:gsub("\n*$", "") -- trim new lines at end + text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines + text = text:gsub("\a", "\n") -- re-add our wished \n + return text +end + + +local function getUrlContent(url, timeout) + local socket = require('socket') + local ltn12 = require('ltn12') + local requester + if url:sub(1,7) == "http://" then + requester = require('socket.http') + elseif url:sub(1,8) == "https://" then + requester = require('ssl.https') + else + return false, "Unsupported protocol" + end + requester.TIMEOUT = timeout or 10 + local request = {} + local sink = {} + request['url'] = url + request['method'] = 'GET' + request['sink'] = ltn12.sink.table(sink) + -- first argument returned by skip is code + local _, headers, status = socket.skip(1, requester.request(request)) + + if headers == nil then + logger.warn("No HTTP headers") + return false, "Network unavailable" + end + if status ~= "HTTP/1.1 200 OK" then + logger.warn("HTTP status not okay:", status) + return false, "Network unavailable" + end + + return true, table.concat(sink) +end + +-- UTF8 of unicode geometrical shapes we'll prepend to wikipedia section headers, +-- to help identifying hierarchy (othewise, the small font size differences helps). +-- Best if identical to the ones used above for prettifying full plain text page. 
+-- These chosen ones are available in most fonts (prettier symbols +-- exist in unicode, but are available in a few fonts only) and +-- have a quite consistent size/weight in all fonts. +local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?) +local h2_sym = "\xE2\x96\x89" -- big black square +local h3_sym = "\xE2\x97\x86" -- black diamond +local h4_sym = "\xE2\x97\xA4" -- black upper left triangle +local h5_sym = "\xE2\x9C\xBF" -- black florette +local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x +-- Other available ones in most fonts +-- local hXsym = "\xE2\x9C\x9A" -- heavy greek cross +-- local hXsym = "\xE2\x97\xA2" -- black lower right triangle +-- local hXsym = "\xE2\x97\x89" -- fish eye +-- local hXsym = "\xE2\x96\x97" -- quadrant lower right + +local ext_to_mimetype = { + png = "image/png", + jpg = "image/jpeg", + jpeg = "image/jpeg", + gif = "image/gif", + svg = "image/svg+xml", + html= "application/xhtml+xml", + xhtml= "application/xhtml+xml", + ncx = "application/x-dtbncx+xml", + js = "text/javascript", + css = "text/css", + otf = "application/opentype", + ttf = "application/truetype", + woff = "application/font-woff", +} + + +-- Create an epub file (with possibly images) +-- This is non-UI code (for batch creation or emulator test), but it accepts +-- a progress_callback function that will be feed with progress information +-- that could be shown to the user. 
+function Wikipedia:createEpub(epub_path, page, lang, with_images, progress_callback) + if not progress_callback then + -- Make our own logging only process_callback + progress_callback = function(text, confirm) + logger.info("progress", confirm and "confirm" or "info", text) + return true -- always select "OK" in ConfirmBox + end + end + + progress_callback(_("Fetching Wikipedia page...")) + local ok, phtml = pcall(self.wikiphtml, self, page, lang) + if not ok then + progress_callback(phtml) + -- Sleep a bit to make that error seen + util.sleep(2) + progress_callback() -- close last progress info + return false + end + + -- Get infos from wikipedia result + -- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection) + local cancelled = false + local html = phtml.text["*"] -- html content + local page_cleaned = page:gsub("_", " ") -- page title + local page_htmltitle = phtml.displaytitle -- page title with possible tags + local sections = phtml.sections -- Wikipedia provided TOC + local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid) + -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is, + -- should it changes if content is updated (as now, including the wikipedia revisionId), + -- or should it stays the same even if revid changes (content of the same book updated). + + -- We need to find images in HTML to tell how many when asking user if they should be included + local images = {} + local seen_images = {} + local imagenum = 1 + local cover_imgid = "" -- best candidate for cover among our images + local processImg = function(img_tag) + local src = img_tag:match([[src="([^"]*)"]]) + if src == nil or src == "" then + logger.info("no src found in ", img_tag) + return nil + end + if src:sub(1,2) == "//" then + src = "https:" .. 
src -- Wikipedia redirects from http to https, so use https + end + local cur_image + if seen_images[src] then -- already seen + cur_image = seen_images[src] + else + local ext = src:match(".*%.(%S+)") + if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it + logger.info("no file extension found in ", src) + return nil + end + ext = ext:lower() + local imgid = string.format("img%05d", imagenum) + local imgpath = string.format("images/%s.%s", imgid, ext) + local mimetype = ext_to_mimetype[ext] or "" + local width = img_tag:match([[width="([^"]*)"]]) + local height = img_tag:match([[height="([^"]*)"]]) + -- Get higher resolution (2x) image url + local src2x = nil + local srcset = img_tag:match([[srcset="([^"]*)"]]) + if srcset then + srcset = " "..srcset.. ", " -- for next pattern to possibly match 1st or last item + src2x = srcset:match([[ (%S+) 2x, ]]) + if src2x and src2x:sub(1,2) == "//" then + src2x = "https:" .. src2x + end + end + cur_image = { + imgid = imgid, + imgpath = imgpath, + src = src, + src2x = src2x, + mimetype = mimetype, + width = width, + height = height, + } + table.insert(images, cur_image) + seen_images[src] = cur_image + -- Use first image of reasonable size (not an icon) and portrait-like as cover-image (a missing or non-numeric width/height counts as 0, so such images are skipped instead of raising on a nil comparison) + if cover_imgid == "" and (tonumber(width) or 0) > 50 and (tonumber(height) or 0) > tonumber(width) then + cover_imgid = imgid + end + imagenum = imagenum + 1 + end + -- crengine will NOT use width and height attributes, but it will use + -- those found in a style attribute. + -- If we get src2x images, crengine will scale them down to the 1x image size + -- (less space wasted by images while reading), but the 2x quality will be + -- there when image is viewed full screen with ImageViewer widget. 
+ return string.format([[]], cur_image.imgpath, cur_image.width, cur_image.height) + end + html = html:gsub("(<%s*img [^>]*>)", processImg) + logger.dbg("Images found in html:", images) + + -- See what to do with images + local include_images = false + local use_img_2x = false + if with_images then + -- if no progress_callback (non UI), our fake one will return true + if #images > 0 then + include_images = progress_callback(T(_("Page contains %1 images.\nWould you like to download and include them in epub ?"), #images), true) + if include_images then + use_img_2x = progress_callback(_("Would you like to get slightly higher quality images (but bigger file size) ?"), true) + end + else + progress_callback(_("Page contains no image.")) + util.sleep(1) -- Let the user see that + end + end + if not include_images then + -- Remove img tags to avoid little blank squares of missing images + html = html:gsub("<%s*img [^>]*>", "") + -- We could remove the whole image container
, + -- but it's a lot of nested
and not easy to do. + -- So the user will see the image legends and know a bit about + -- the images he chose to not get. + end + + -- Open the zip file (with .tmp for now, as crengine may still + -- have a handle to the final epub_path, and we don't want to + -- delete a good one if we fail/cancel later) + local epub_path_tmp = epub_path .. ".tmp" + local ZipWriter = require("ffi/zipwriter") + local epub = ZipWriter:new{} + if not epub:open(epub_path_tmp) then + return false + end + + -- We now create and add all the required epub files + + -- ---------------------------------------------------------------- + -- /mimetype : always "application/epub+zip" + epub:add("mimetype", "application/epub+zip") + + -- ---------------------------------------------------------------- + -- /META-INF/container.xml : always the same content + epub:add("META-INF/container.xml", [[ + + + + + +]]) + + -- ---------------------------------------------------------------- + -- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory) + -- Other possible items in this file that are of no interest to crengine : + -- In : + -- + -- + -- (crengine only uses to get the cover image) + -- In : + -- + -- And a section : + -- + -- + -- + -- + local koreader_version = "KOReader" + if lfs.attributes("git-rev", "mode") == "file" then + koreader_version = "KOReader "..io.open("git-rev", "r"):read() + end + local content_opf_parts = {} + -- head + table.insert(content_opf_parts, string.format([[ + + + + %s + Wikipedia %s + %s + %s + %s + + + + + + +]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid)) + -- images files + if include_images then + for inum, img in ipairs(images) do + table.insert(content_opf_parts, string.format([[ %s]], img.imgid, img.imgpath, img.mimetype, "\n")) + end + end + -- tail + table.insert(content_opf_parts, [[ + + + + + +]]) + epub:add("OEBPS/content.opf", table.concat(content_opf_parts)) + + -- 
---------------------------------------------------------------- + -- OEBPS/stylesheet.css + -- crengine will use its own data/epub.css, we just add/fix a few styles + -- to look more alike wikipedia web pages (that the user can ignore + -- with "Embedded Style" off) + epub:add("OEBPS/stylesheet.css", [[ +/* make section headers looks left aligned and avoid some page breaks */ +h1, h2 { + text-align: left; +} +h3, h4, h5, h6, h7 { + page-break-before: avoid; + page-break-after: avoid; + text-align: left; +} +/* avoid page breaks around our centered titles on first page */ +h1.koreaderwikifrontpage, h5.koreaderwikifrontpage { + page-break-before: avoid; + page-break-inside: avoid; + page-break-after: avoid; + text-align: center; + margin-top: 0em; +} +p.koreaderwikifrontpage { + font-style: italic; + font-size: 90%; + margin-left: 2em; + margin-right: 2em; + margin-top: 1em; + margin-bottom: 1em; +} +hr.koreaderwikifrontpage { + margin-left: 20%; + margin-right: 20%; + margin-bottom: 1.2em; +} +/* So many links, make them look like normal text except for underline */ +a { + display:inline; + text-decoration: underline; + color: black; + font-weight: normal; +} +/* No underline for links without their href that we removed */ +a.newwikinonexistent { + text-decoration: none; +} +/* show a box around image thumbnails */ +div.thumb { + width: 80%; + border: dotted 1px black; + margin-top: 0.5em; + margin-bottom: 0.5em; + margin-left: 2.5em; + margin-right: 2.5em; + padding-top: ]].. (include_images and "0.5em" or "0.15em") .. 
[[; + padding-bottom: 0.2em; + padding-left: 0.5em; + padding-right: 0.5em; + text-align: center; + font-size: 90%; +} +/* don't waste left margin for notes and list of pages */ +ul, ol { + margin-left: 0em; +} +/* helps crengine to not display them as block elements */ +time, abbr, sup { + display: inline; +} +]]) + + -- ---------------------------------------------------------------- + -- OEBPS/toc.ncx : table of content + local toc_ncx_parts = {} + local depth = 0 + local cur_level = 0 + local np_end = [[]] + local num = 1 + -- Add our own first section for first page, with page name as title + table.insert(toc_ncx_parts, string.format([[%s]], num, num, page_cleaned)) + table.insert(toc_ncx_parts, np_end) + -- Wikipedia sections items seem to be already sorted by index, so no need to sort + for isec, s in ipairs(sections) do + num = num + 1 + local s_anchor = s.anchor + local s_title = string.format("%s %s", s.number, s.line) + s_title = (s_title:gsub("(%b<>)", "")) -- titles may include and other html tags + local s_level = s.toclevel + if s_level > depth then + depth = s_level -- max depth required in toc.ncx + end + if s_level == cur_level then + table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint + elseif s_level < cur_level then + table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint + while s_level < cur_level do -- close all in-between navPoint + table.insert(toc_ncx_parts, np_end) + cur_level = cur_level - 1 + end + elseif s_level > cur_level + 1 then + -- a jump from level N to level N+2 or more ... should not happen + -- per epub spec, but we don't know about wikipedia... 
+ -- so we create missing intermediate navPoints with same anchor as current section + while s_level > cur_level + 1 do + table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) + table.insert(toc_ncx_parts, string.format([[-]], num, num, s_anchor)) + cur_level = cur_level + 1 + num = num + 1 + end + -- elseif s_level == cur_level + 1 then + -- sublevel, nothing to close, nothing to add + end + cur_level = s_level + table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) -- indentation, in case a person looks at it + table.insert(toc_ncx_parts, string.format([[%s]], num, num, s_title, s_anchor)) + end + -- close nested + while cur_level > 0 do + table.insert(toc_ncx_parts, np_end) + cur_level = cur_level - 1 + end + -- Prepend NCX head + table.insert(toc_ncx_parts, 1, string.format([[ + + + + + + + + + + + %s + + +]], bookid, depth, page_cleaned)) + -- Append NCX tail + table.insert(toc_ncx_parts, [[ + + +]]) + epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts)) + + -- ---------------------------------------------------------------- + -- OEBPS/content.html + -- Some small fixes to Wikipedia HTML to make crengine and the user happier + + -- Most images are in a link to the image info page, which is a useless + -- external link for us, so let's remove this link. + html = html:gsub("]*>%s*(<%s*img [^>]*>)%s*", "%1") + + -- For some
, which include nested divs, although + -- perfectly balanced, crengine seems to miss some closing
and we + -- end up having our image bordered box including the remaining main wiki text. + -- It looks like this code is supposed to deal with class= containing multiple + -- class names : + -- https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a + -- and that it may stumble on some cases. + -- It's all perfectly fine if we make all these div with a single class name + -- html = html:gsub([[
]], [[
]]) + -- + -- But we may as well make all class= have a single name to avoid other problems + -- (no real risk with that, as we don't define any style for wikipedia class names, + -- except div.thumb that always appears first). + html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]]) + + -- crengine seems to consider unknown tag as 'block' elements, so we may + -- want to remove or replace those that should be considered 'inline' elements + html = html:gsub("]*>", "") + + -- Fix internal wikipedia links with full server url (including lang) so + -- ReaderLink can notice them and deal with them with a LookupWikipedia event. + local wiki_base_url = self:getWikiServer(lang) + -- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]]) + -- + -- Also, crengine deals strangely with percent encoded utf8 : + -- if the link in the html is : + -- we get from credocument:getLinkFromPosition() : http://fr.wikipedia.org/wiki/Françoix + -- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8, + -- when we should have get "\xc3\xa7" ... + -- We can avoid that by putting in the url plain unencoded UTF8 + local hex_to_char = function(x) return string.char(tonumber(x, 16)) end + local fixEncodedWikiPageTitle = function(wiki_page) + wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char) + return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page) + end + html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle) + + -- Remove href from links to non existant wiki page so they are not clickable : + -- PageTitle©on + -- (removal of the href="" will make them non clickable) + html = html:gsub([[]* class="new"[^>]*>]], [[]]) + + -- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php..) 
+ html = html:gsub([[href="//]], [[href="https://]]) + + -- crengine does not return link if multiple class names in () + -- it would be no problem as we can't follow them, but when the user tap + -- on it, the tap is propagated to other widgets and page change happen... + -- html = html:gsub([[ (if it starts a line) or after (if it + -- ends a line or a block) by wrapping it with U+200B ZERO WIDTH SPACE which will + -- make the DOM tree walking code to find a link stop at it. + -- html = html:gsub("(<[aA])", "\xE2\x80\x8B%1") + -- html = html:gsub("()", "%1\xE2\x80\x8B") + -- Fixed in crengine lvtinydom. + + if self.wiki_prettify then + -- Prepend some symbols to section titles for a better visual feeling of hierarchy + html = html:gsub("

", "

"..h1_sym.." ") + html = html:gsub("

", "

"..h2_sym.." ") + html = html:gsub("

", "

"..h3_sym.." ") + html = html:gsub("

", "

"..h4_sym.." ") + html = html:gsub("

", "
"..h5_sym.." ") + html = html:gsub("
", "
"..h6_sym.." ") + end + + -- Note: in all the gsub patterns above, we used lowercase for tags and attributes + -- because it's how they are in wikipedia HTML and it makes the pattern simple. + -- If one day this changes, they'll have to be replaced with href => [Hh][Rr][Ee][Ff] ... + + -- We can finally build the final HTML with some header of our own + local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S")) + local online_version_htmllink = string.format([[%s]], wiki_base_url, page:gsub(" ", "_"), _("online version")) + local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink) + epub:add("OEBPS/content.html", string.format([[ + + + %s + + + +

%s

+
Wikipedia %s
+

%s
%s

+
+%s + + +]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html)) + + -- ---------------------------------------------------------------- + -- OEBPS/images/* + if include_images then + local nb_images = #images + for inum, img in ipairs(images) do + progress_callback(T(_("Fetching image %1 / %2 ..."), inum, nb_images)) + local src = img.src + if use_img_2x and img.src2x then + src = img.src2x + end + logger.dbg("Getting img ", src) + local success, content = getUrlContent(src) + -- success, content = getUrlContent(src..".unexistant") -- to simulate failure + if success then + logger.dbg("success, size:", #content) + else + logger.info("failed fetching:", src) + end + if success then + epub:add("OEBPS/"..img.imgpath, content) + else + local go_on = progress_callback(T(_("Failed getting image %1, continue anyway ?"), inum), true) + if not go_on then + cancelled = true + break + end + end + end + end + + -- Done with adding files + if cancelled then + progress_callback(_("Cleaning up...")) + else + progress_callback(_("Packing epub...")) + end + epub:close() + -- This was nearly a no-op, so sleep a bit to make that progress step seen + util.usleep(300000) + progress_callback() -- close last progress info + + if cancelled then + -- Build was cancelled, remove half created .epub + if lfs.attributes(epub_path_tmp, "mode") == "file" then + os.remove(epub_path_tmp) + end + return false + end + + -- Finally move the .tmp to the final file + os.rename(epub_path_tmp, epub_path) + logger.info("successfully created:", epub_path) + return true +end + + +-- Wrapper to Wikipedia:createEpub() with UI progress info +function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback) + -- For progress_callback to be able to wait when needed + -- for user confirmation, we need to wrap Wikipedia:createEpub + -- in a coroutine, that can be resumed by these confirm callbacks. 
+ local UIManager = require("ui/uimanager") + local InfoMessage = require("ui/widget/infomessage") + local ConfirmBox = require("ui/widget/confirmbox") + + -- Visual progress callback + local cur_progress_box = nil + local function ui_progress_callback(text, confirmbox) + if cur_progress_box then + -- close previous progress info + UIManager:close(cur_progress_box) + -- no repaint here, we'll do that below when new stuff is shown + end + if not text then + -- no text given, used to just close previous progress info when done + -- a repaint is needed + UIManager:forceRePaint() + return true + end + if confirmbox then + -- ConfirmBox requested: callbacks will resume coroutine + local _coroutine = coroutine.running() + cur_progress_box = ConfirmBox:new{ + text = text, + ok_callback = function() + coroutine.resume(_coroutine, true) + end, + cancel_callback = function() + coroutine.resume(_coroutine, false) + end, + } + else + -- simple InfoMessage requested + cur_progress_box = InfoMessage:new{text = text} + end + logger.dbg("Showing", confirmbox and "ConfirmBox" or "InfoMessage", text) + UIManager:show(cur_progress_box) + UIManager:forceRePaint() + if not confirmbox then + return true -- nothing more to do + end + -- we need to wait for ConfirmBox callback + logger.dbg("waiting for coroutine to resume") + if coroutine.running() then + local result = coroutine.yield() + logger.dbg(" coroutine ran and returned", result) + return result + end + end + + -- Coroutine wrapping Wikipedia:createEpub() + local co = coroutine.create(function() + -- If errors in Wikipedia:createEpub(), the coroutine + -- would just abort without crashing the reader, so + -- pcall would not be needed. But if that happens, + -- pcall will let us know and returns the error, + -- that we can log. 
+ local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true, ui_progress_callback) + if ok and success then + result_callback(true) + else + ui_progress_callback() -- close any last progress info not cleaned + logger.warn("Wikipedia.createEpub pcall:", ok, success) + result_callback(false) + end + end) + -- Execute coroutine + coroutine.resume(co) +end return Wikipedia