From 8a04dc985272417d01bcb3316a15d4e9bb64468d Mon Sep 17 00:00:00 2001 From: Scarlett <82218266+roygbyte@users.noreply.github.com> Date: Fri, 21 Jan 2022 14:23:21 -0400 Subject: [PATCH] NewsDownloader: new option to allow EPUB volumization (#8263) When this feature is enabled on a feed and that feed is synced, all new feed entries will be collected into a new single EPUB file. This is achieved by implementing a feed history feature (downloaded feeds are added as M5D hashes to a LuaSettings file), and by introducing additional methods into epubdownloader.lua that allow for multiple HTML documents to be added into single EPUB file. --- .../epubdownloadbackend.lua | 627 +++++++------- .../newsdownloader.koplugin/feed_source.lua | 398 +++++++++ plugins/newsdownloader.koplugin/feed_view.lua | 43 +- .../http_utilities.lua | 126 +++ plugins/newsdownloader.koplugin/main.lua | 797 +++++++----------- 5 files changed, 1163 insertions(+), 828 deletions(-) create mode 100644 plugins/newsdownloader.koplugin/feed_source.lua create mode 100644 plugins/newsdownloader.koplugin/http_utilities.lua diff --git a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua index adfd93924..20663095f 100644 --- a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua +++ b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua @@ -1,15 +1,10 @@ +local NewsHelpers = require("http_utilities") local Version = require("version") -local ffiutil = require("ffi/util") -local http = require("socket.http") local logger = require("logger") -local ltn12 = require("ltn12") -local socket = require("socket") local socket_url = require("socket.url") -local socketutil = require("socketutil") local _ = require("gettext") -local T = ffiutil.template -local EpubDownloadBackend = { +local EpubBuilder = { -- Can be set so HTTP requests will be done under Trapper and -- be interruptible trap_widget = nil, @@ -17,8 +12,89 @@ local EpubDownloadBackend = { -- and error() with this code. We make the value of this error -- accessible here so that caller can know it's a user dismiss. dismissed_error_code = "Interrupted by user", + title = nil, + ncx_toc = nil, + ncx_manifest = nil, + ncx_contents = nil, + ncx_images = nil, } -local max_redirects = 5; --prevent infinite redirects + +function EpubBuilder:new(o) + o = o or {} + self.__index = self + setmetatable(o, self) + + return o +end + +function EpubBuilder:build(abs_output_path) + -- Open the zip file (with .tmp for now, as crengine may still + -- have a handle to the final epub_path, and we don't want to + -- delete a good one if we fail/cancel later) + local tmp_path = abs_output_path .. ".tmp" + local ZipWriter = require("ffi/zipwriter") + local epub = ZipWriter:new{} + + if not epub:open(tmp_path) then + logger.dbg("Failed to open tmp_path") + return false + end + + epub:add("mimetype", "application/epub+zip") + epub:add("META-INF/container.xml", [[ + + + + + +]]) + + -- Add the manifest. + if not self.ncx_manifest or #self.ncx_manifest == 0 then + error("EPUB does not contain a valid manifest.") + end + --logger.dbg("Adding Manifest:", self.ncx_manifest) + epub:add("OEBPS/content.opf", table.concat(self.ncx_manifest)) + + -- Add the table of contents. + if not self.ncx_toc or #self.ncx_toc == 0 then + error("EPUB does not contain a valid table of contents.") + end + --logger.dbg("Adding TOC:", self.ncx_toc) + epub:add("OEBPS/toc.ncx", table.concat(self.ncx_toc)) + + -- Add the contents. 
+ if not self.ncx_contents or #self.ncx_manifest == 0 then + error("EPUB does not contain any content.") + end + --logger.dbg("Adding Content:", self.ncx_contents) + + for index, content in ipairs(self.ncx_contents) do + epub:add("OEBPS/" .. content.filename, content.html) + end + + -- Add the images. + --logger.dbg("Adding Images:", self.ncx_images) + if self.ncx_images then + for index, image in ipairs(self.ncx_images) do + epub:add( + "OEBPS/" .. image.path, + image.content, + image.no_compression + ) + end + end + + epub:close() + os.rename(tmp_path, abs_output_path) + + collectgarbage() + +end + +function EpubBuilder:release() + -- Stub for cleanup methods +end -- filter HTML using CSS selector local function filter(text, element) @@ -68,79 +144,9 @@ local function filter(text, element) return "" .. filtered .. "" end --- Get URL content -local function getUrlContent(url, timeout, maxtime, redirectCount) - logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")") - if not redirectCount then - redirectCount = 0 - elseif redirectCount == max_redirects then - error("EpubDownloadBackend: reached max redirects: ", redirectCount) - end - - if not timeout then timeout = 10 end - logger.dbg("timeout:", timeout) - - local sink = {} - local parsed = socket_url.parse(url) - socketutil:set_timeout(timeout, maxtime or 30) - local request = { - url = url, - method = "GET", - sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink), - } - logger.dbg("request:", request) - local code, headers, status = socket.skip(1, http.request(request)) - socketutil:reset_timeout() - logger.dbg("After http.request") - local content = table.concat(sink) -- empty or content accumulated till now - logger.dbg("type(code):", type(code)) - logger.dbg("code:", code) - logger.dbg("headers:", headers) - logger.dbg("status:", status) - logger.dbg("#content:", #content) - - if code == socketutil.TIMEOUT_CODE or - code == socketutil.SSL_HANDSHAKE_CODE or - code == socketutil.SINK_TIMEOUT_CODE - then - logger.warn("request interrupted:", code) - return false, code - end - if headers == nil then - logger.warn("No HTTP headers:", code, status) - return false, "Network or remote server unavailable" - end - if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK - if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302... 
- local redirected_url = headers.location - local parsed_redirect_location = socket_url.parse(redirected_url) - if not parsed_redirect_location.host then - parsed_redirect_location.host = parsed.host - parsed_redirect_location.scheme = parsed.scheme - redirected_url = socket_url.build(parsed_redirect_location) - end - logger.dbg("getUrlContent: Redirecting to url: ", redirected_url) - return getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1) - else - error("EpubDownloadBackend: Don't know how to handle HTTP response status: ", status) - end - logger.warn("HTTP status not okay:", code, status) - return false, "Remote server error or unavailable" - end - if headers and headers["content-length"] then - -- Check we really got the announced content size - local content_length = tonumber(headers["content-length"]) - if #content ~= content_length then - return false, "Incomplete content received" - end - end - logger.dbg("Returning content ok") - return true, content -end - -function EpubDownloadBackend:getResponseAsString(url) - logger.dbg("EpubDownloadBackend:getResponseAsString(", url, ")") - local success, content = getUrlContent(url) +function EpubBuilder:getResponseAsString(url) + logger.dbg("EpubBuilder:getResponseAsString(", url, ")") + local success, content = NewsHelpers:getUrlContent(url) if (success) then return content else @@ -148,38 +154,14 @@ function EpubDownloadBackend:getResponseAsString(url) end end -function EpubDownloadBackend:setTrapWidget(trap_widget) +function EpubBuilder:setTrapWidget(trap_widget) self.trap_widget = trap_widget end -function EpubDownloadBackend:resetTrapWidget() +function EpubBuilder:resetTrapWidget() self.trap_widget = nil end -function EpubDownloadBackend:loadPage(url) - local completed, success, content - if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget() - local Trapper = require("ui/trapper") - local timeout, maxtime = 30, 60 - -- We use dismissableRunInSubprocess with complex return values: - completed, success, content = Trapper:dismissableRunInSubprocess(function() - return getUrlContent(url, timeout, maxtime) - end, self.trap_widget) - if not completed then - error(self.dismissed_error_code) -- "Interrupted by user" - end - else - local timeout, maxtime = 10, 60 - success, content = getUrlContent(url, timeout, maxtime) - end - logger.dbg("success:", success, "type(content):", type(content), "content:", content:sub(1, 500), "...") - if not success then - error(content) - else - return content - end -end - local ext_to_mimetype = { png = "image/png", jpg = "image/jpeg", @@ -195,29 +177,15 @@ local ext_to_mimetype = { ttf = "application/truetype", woff = "application/font-woff", } --- Create an epub file (with possibly images) -function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element) - logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")") - -- Use Trapper to display progress and ask questions through the UI. - -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise - -- Trapper:info() and Trapper:confirm() will just use logger. 
- local UI = require("ui/trapper") - -- We may need to build absolute urls for non-absolute links and images urls +-- GetPublishableHtml +function EpubBuilder:getImagesAndHtml(html, url, include_images, filter_enable, filter_element) local base_url = socket_url.parse(url) - - local cancelled = false - local page_htmltitle = html:match([[(.*)]]) - logger.dbg("page_htmltitle is ", page_htmltitle) --- local sections = html.sections -- Wikipedia provided TOC - local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid) - -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is, - -- should it changes if content is updated (as now, including the wikipedia revisionId), - -- or should it stays the same even if revid changes (content of the same book updated). - if filter_enable then html = filter(html, filter_element) end local images = {} local seen_images = {} local imagenum = 1 local cover_imgid = nil -- best candidate for cover among our images + html = filter_enable and filter(html, filter_element) or html + local processImg = function(img_tag) local src = img_tag:match([[src="([^"]*)"]]) if src == nil or src == "" then @@ -272,13 +240,20 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me width = width, height = height, } - table.insert(images, cur_image) + seen_images[src] = cur_image -- Use first image of reasonable size (not an icon) and portrait-like as cover-image if not cover_imgid and width and width > 50 and height and height > 50 and height > width then logger.dbg("Found a suitable cover image") cover_imgid = imgid + cur_image["cover_image"] = true end + + table.insert( + images, + cur_image + ) + imagenum = imagenum + 1 end -- crengine will NOT use width and height attributes, but it will use @@ -296,130 +271,53 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me local style = table.concat(style_props, "; ") return string.format([[]], cur_image.imgpath, style) end - html = html:gsub("(<%s*img [^>]*>)", processImg) - logger.dbg("Images found in html:", images) - -- See what to do with images - local use_img_2x = false - if not include_images then + if include_images then + html = html:gsub("(<%s*img [^>]*>)", processImg) + else -- Remove img tags to avoid little blank squares of missing images html = html:gsub("<%s*img [^>]*>", "") -- We could remove the whole image container
, -- but it's a lot of nested
and not easy to do. -- So the user will see the image legends and know a bit about - -- the images he chose to not get. + -- the images they chose to not get. end - UI:info(T(_("%1\n\nBuilding EPUB…"), message)) - -- Open the zip file (with .tmp for now, as crengine may still - -- have a handle to the final epub_path, and we don't want to - -- delete a good one if we fail/cancel later) - local epub_path_tmp = epub_path .. ".tmp" - local ZipWriter = require("ffi/zipwriter") - local epub = ZipWriter:new{} - if not epub:open(epub_path_tmp) then - logger.dbg("Failed to open epub_path_tmp") - return false - end + -- Force a GC to free the memory we used (the second call may help + -- reclaim more memory). + collectgarbage() + collectgarbage() + return images, html +end - -- We now create and add all the required epub files +function EpubBuilder:setTitle(title) + self.title = title +end - -- ---------------------------------------------------------------- - -- /mimetype : always "application/epub+zip" - epub:add("mimetype", "application/epub+zip") - -- ---------------------------------------------------------------- - -- /META-INF/container.xml : always the same content - epub:add("META-INF/container.xml", [[ - - - - - -]]) - logger.dbg("Added META-INF/container.xml") - - -- ---------------------------------------------------------------- - -- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory) - -- Other possible items in this file that are of no interest to crengine : - -- In : - -- - -- - -- (crengine only uses to get the cover image) - -- In : - -- - -- And a section : - -- - -- - -- - -- - local content_opf_parts = {} - -- head - local meta_cover = "" - if include_images and cover_imgid then - meta_cover = string.format([[]], cover_imgid) - end - logger.dbg("meta_cover:", meta_cover) - table.insert(content_opf_parts, string.format([[ - - - - %s - KOReader %s - %s - - - - - -]], page_htmltitle, Version:getCurrentRevision(), meta_cover)) - -- images files - if include_images then - for inum, img in ipairs(images) do - table.insert(content_opf_parts, string.format([[ %s]], img.imgid, img.imgpath, img.mimetype, "\n")) - end - end - -- tail - table.insert(content_opf_parts, [[ - - - - - -]]) - epub:add("OEBPS/content.opf", table.concat(content_opf_parts)) - logger.dbg("Added OEBPS/content.opf") - - -- ---------------------------------------------------------------- - -- OEBPS/stylesheet.css - --- @todo We told it we'd include a stylesheet.css, so it's probably best - -- that we do. In theory, we could try to fetch any *.css files linked in - -- the main html. - epub:add("OEBPS/stylesheet.css", [[ -/* Empty */ -]]) - logger.dbg("Added OEBPS/stylesheet.css") - - -- ---------------------------------------------------------------- - -- OEBPS/toc.ncx : table of content +function EpubBuilder:addToc(chapters) local toc_ncx_parts = {} local depth = 0 - local cur_level = 0 - local np_end = [[]] - local num = 1 - -- Add our own first section for first page, with page name as title - table.insert(toc_ncx_parts, string.format([[%s]], num, num, page_htmltitle)) - table.insert(toc_ncx_parts, np_end) - --- @todo Not essential for most articles, but longer articles might benefit - -- from parsing tags and constructing a proper TOC - while cur_level > 0 do - table.insert(toc_ncx_parts, np_end) - cur_level = cur_level - 1 + local num = 0 + + for index, chapter in ipairs(chapters) do + -- Add nav part for each chapter. 
+ table.insert( + toc_ncx_parts, + string.format([[%s]], + num, + num, + chapter.title, + chapter.md5 + ) + ) + num = num + 1 end - -- Prepend NCX head - table.insert(toc_ncx_parts, 1, string.format([[ + -- Prepend NCX head. + table.insert( + toc_ncx_parts, + 1, + string.format([[ @@ -433,99 +331,172 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me %s -]], bookid, depth, page_htmltitle)) - -- Append NCX tail - table.insert(toc_ncx_parts, [[ +]], +"placeholder_bookid", +depth, +self.title + ) + ) + -- Append NCX tail. + table.insert( + toc_ncx_parts, + [[ -]]) - epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts)) - logger.dbg("Added OEBPS/toc.ncx") - - -- ---------------------------------------------------------------- - -- OEBPS/content.html - epub:add("OEBPS/content.html", html) - logger.dbg("Added OEBPS/content.html") - - -- Force a GC to free the memory we used till now (the second call may - -- help reclaim more memory). - collectgarbage() - collectgarbage() - - -- ---------------------------------------------------------------- - -- OEBPS/images/* - if include_images then - local nb_images = #images - for inum, img in ipairs(images) do - -- Process can be interrupted at this point between each image download - -- by tapping while the InfoMessage is displayed - -- We use the fast_refresh option from image #2 for a quicker download - local go_on = UI:info(T(_("%1\n\nRetrieving image %2 / %3 …"), message, inum, nb_images), inum >= 2) - if not go_on then - logger.dbg("cancelled") - cancelled = true - break - end - local src = img.src - if use_img_2x and img.src2x then - src = img.src2x - end - logger.dbg("Getting img ", src) - local success, content = getUrlContent(src) - -- success, content = getUrlContent(src..".unexistant") -- to simulate failure - if success then - logger.dbg("success, size:", #content) - else - logger.dbg("failed fetching:", src) - end - if success then - -- Images do not need to be compressed, so spare some cpu cycles - local no_compression = true - if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text) - no_compression = false - end - epub:add("OEBPS/"..img.imgpath, content, no_compression) - logger.dbg("Adding OEBPS/"..img.imgpath) - else - go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue")) - if not go_on then - cancelled = true - break - end - end - end - end - - -- Done with adding files - if cancelled then - if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then - cancelled = false - end - end - if cancelled then - UI:info(_("Canceled. Cleaning up…")) - else - UI:info(T(_("%1\n\nPacking EPUB…"), message)) - end - epub:close() - - if cancelled then - -- Build was cancelled, remove half created .epub - if lfs.attributes(epub_path_tmp, "mode") == "file" then - os.remove(epub_path_tmp) - end - return false - end - - -- Finally move the .tmp to the final file - os.rename(epub_path_tmp, epub_path) - logger.dbg("successfully created:", epub_path) - - -- Force a GC to free the memory we used (the second call may help - -- reclaim more memory). 
- collectgarbage() - collectgarbage() - return true +]] + ) + self.ncx_toc = toc_ncx_parts end -return EpubDownloadBackend +function EpubBuilder:addManifest(chapters, images) + local content_opf_parts = {} + local spine_parts = {} + local meta_cover = "" + + if #images > 0 then + for inum, image in ipairs(images) do + table.insert( + content_opf_parts, + string.format([[%s]], + image.imgid, + image.imgpath, + image.mimetype, + "\n" + ) + ) + -- See if the image has the tag we previously set indicating + -- it can be used as a cover image. + if image.cover_image then + meta_cover = string.format([[]], image.imgid) + end + end + end + + if #chapters > 0 then + for index, chapter in ipairs(chapters) do + table.insert( + content_opf_parts, + string.format([[%s]], + chapter.md5, + chapter.md5, + "\n" + ) + ) + table.insert( + spine_parts, + string.format([[%s]], + chapter.md5, + "\n" + ) + ) + end + end + + logger.dbg("meta_cover:", meta_cover) + + table.insert( + content_opf_parts, + 1, + string.format([[ + + + %s + KOReader %s + %s + + + +]], self.title, Version:getCurrentRevision(), meta_cover) + ) + -- tail + table.insert( + content_opf_parts, + string.format([[ + + +%s + + +]], table.concat(spine_parts) + ) + ) + + self.ncx_manifest = content_opf_parts +end + +function EpubBuilder:addContents(chapters) + local contents = {} + + for index, chapter in ipairs(chapters) do + table.insert( + contents, + { + filename = chapter.md5 .. ".html", + html = chapter.html, + } + ) + end + + self.ncx_contents = contents +end + +function EpubBuilder:addImages(images) + local images_table = {} + + for index, image in ipairs(images) do + if not image.src then + return + end + + local src = image.src + local success, content = NewsHelpers:getUrlContent(src) + -- success, content = NewsHelpers:getUrlContent(src..".unexistant") -- to simulate failure + if success then + logger.dbg("EpubBuilder:addImages = success, size:", #content) + else + logger.dbg("EpubBuilder:addImages = failure fetching:", src) + end + + if success then + -- Images do not need to be compressed, so spare some cpu cycles + local no_compression = true + if image.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text) + no_compression = false + end + table.insert( + images_table, + { + path = image.imgpath, + content = content, + compression = no_compression + } + ) + end + end + + self.ncx_images = images_table + +end + +-- There can be multiple links. +-- For now we just assume the first link is probably the right one. +--- @todo Write unit tests. +-- Some feeds that can be used for unit test. +-- http://fransdejonge.com/feed/ for multiple links. +-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes. 
+function EpubBuilder:getFeedLink(possible_link) + local E = {} + logger.dbg("Possible link", possible_link) + if type(possible_link) == "string" then + return possible_link + elseif (possible_link._attr or E).href then + return possible_link._attr.href + elseif ((possible_link[1] or E)._attr or E).href then + return possible_link[1]._attr.href + end +end + + +return EpubBuilder diff --git a/plugins/newsdownloader.koplugin/feed_source.lua b/plugins/newsdownloader.koplugin/feed_source.lua new file mode 100644 index 000000000..2b95bb6fc --- /dev/null +++ b/plugins/newsdownloader.koplugin/feed_source.lua @@ -0,0 +1,398 @@ +local BD = require("ui/bidi") +local DownloadBackend = require("epubdownloadbackend") +local NewsHelpers = require("http_utilities") +local dateparser = require("lib.dateparser") +local logger = require("logger") +local md5 = require("ffi/sha2").md5 +local util = require("util") +local _ = require("gettext") +local N_ = _.ngettext +local FFIUtil = require("ffi/util") +local T = FFIUtil.template + +local FeedSource = { + file_extension = ".epub" +} + +function FeedSource:new(o) + o = o or {} + self.__index = self + setmetatable(o, self) + return o +end + +function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback) + local initialized_feeds = {} + local unsupported_feeds_urls = {} + + for idx, feed in ipairs(feed_list) do + local url = feed[1] + -- Show a UI update + progress_callback(T( + _("Setting up feed %1 of %2."), + idx, + url + )) + -- Initialize the feed + local ok, response = pcall(function() + return self:initializeDocument( + self:fetchDocumentByUrl(url) + ) + end) + -- If the initialization worked, add the feed + -- to a list of initialized feeds + if ok and response then + table.insert(initialized_feeds, { + config = feed, + document = response, + }) + else + table.insert(unsupported_feeds_urls, { + url .. ": " .. response + }) + end + end + + if #unsupported_feeds_urls > 0 then + -- When some errors are present, we get a sour message that includes + -- information about the source of the error. + local unsupported_urls = "" + for key, value in pairs(unsupported_feeds_urls) do + -- Create the error message. + -- unsupported_urls = unsupported_urls .. " " .. value[1] .. " " .. value[2] + unsupported_urls = value[1] .. "\n\n" + -- Not sure what this does. + if key ~= #unsupported_feeds_urls then + unsupported_urls = BD.url(unsupported_urls) .. ", " + end + end + error_callback( + T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #unsupported_feeds_urls), + #unsupported_feeds_urls, unsupported_urls) + ) + end + + return initialized_feeds +end + +-- This function contacts the feed website and attempts to get +-- the RSS/Atom document with a list of the latest items. +function FeedSource:fetchDocumentByUrl(url) + local document + -- Get the XML document representing the feed + local ok, response = pcall(function() + local success, content = NewsHelpers:getUrlContent(url) + if (success) then + return content + else + error("Failed to download content for url: " .. url, 0) + end + end) + -- Check to see if a response is available to deserialize. 
+ if ok then + -- Deserialize the XML document into something Lua can use + document = NewsHelpers:deserializeXMLString(response) + end + -- Return the document or any errors that may have occured + if ok or document then + return document + else + if not ok then + error("(Reason: Failed to download feed document)", 0) + else + error("(Reason: Error during feed document deserialization)", 0) + end + end +end + +-- Supply this method with the XML document returned by the feed, +-- and it will initialized the document by extracting the feed title, +-- feed items, and items count. +function FeedSource:initializeDocument(document) + local feed_title + local feed_items + local total_items + + local ok = pcall(function() + return self:getFeedType( + document, + function() + -- RSS callback + feed_title = util.htmlEntitiesToUtf8(document.rss.channel.title) + feed_items = document.rss.channel.item + total_items = #document.rss.channel.item + end, + function() + -- Atom callback + feed_title = FeedSource:getFeedTitle(document.feed.title) + feed_items = document.feed.entry + total_items = #document.feed.entry + end + ) + end) + + if ok then + document.title = feed_title + document.items = feed_items + document.total_items = total_items + return document + else + error(_("Could not initialize feed document"), 0) + end +end + +function FeedSource:getItemsContent(feed, progress_callback, error_callback) + local limit = tonumber(feed.config.limit) + local total_items = (limit == 0) and + feed.document.total_items or + limit + local initialized_feed_items = {} + -- Download each ite0m in the feed + for index, item in pairs(feed.document.items) do + -- If limit has been met, stop downloading feed. + if limit ~= 0 and index - 1 == limit then + break + end + -- Display feedback to user. + progress_callback(T( + _("%3\n Downloading item %1 of %2"), + index, + total_items, + feed.document.title + )) + -- Download the article's HTML. + local ok, response = pcall(function() + return self:initializeItemHtml( + feed, + self:getItemHtml( + item, + feed.config.download_full_article + ) + ) + end) + + -- Add the result to our table, or send a + -- result to the error callback. + if ok then + table.insert(initialized_feed_items, { + html = response.html, + images = response.images, + item_slug = FeedSource:getItemTitleWithDate(item), + item_title = item.title, + md5 = md5(item.title), + feed_title = feed.document.title, + }) + else + error_callback( + T(_("Could not get content for: %1"), feed.document.title) + ) + end + + end + + if #initialized_feed_items > 0 then + return initialized_feed_items + else + return nil + end +end + +function FeedSource:initializeItemHtml(feed, html) + local url = feed.config[1] + -- local download_full_article = feed.config.download_full_article ~= false + local include_images = feed.config.include_images ~= false + local filter_element = feed.config.filter_element or + feed.config.filter_element == nil + local enable_filter = feed.config.enable_filter ~= false + local item_images, item_html = DownloadBackend:getImagesAndHtml( + html, + url, + include_images, + enable_filter, + filter_element + ) + return { + html = item_html, + images = item_images + } +end + +function FeedSource:getFeedType(document, rss_cb, atom_cb) + -- Check to see if the feed uses RSS. 
+ local is_rss = document.rss and + document.rss.channel and + document.rss.channel.title and + document.rss.channel.item and + document.rss.channel.item[1] and + document.rss.channel.item[1].title and + document.rss.channel.item[1].link + -- Check to see if the feed uses Atom. + local is_atom = document.feed and + document.feed.title and + document.feed.entry[1] and + document.feed.entry[1].title and + document.feed.entry[1].link + -- Setup the feed values based on feed type + if is_atom then + return atom_cb() + elseif is_rss then + return rss_cb() + end + -- Return the values through our callback, or call an + -- error message if the feed wasn't RSS or Atom + if not is_rss or not is_atom then + local error_message + if not is_rss then + error_message = _("(Reason: Couldn't process RSS)") + elseif not is_atom then + error_message = _("(Reason: Couldn't process Atom)") + end + error(error_message) + end +end + +function FeedSource:getItemHtml(item, download_full_article) + if download_full_article then + return NewsHelpers:loadPage( + FeedSource:getFeedLink(item.link) + ) + else + local feed_description = item.description or item.summary + local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.") + return string.format([[ + +%s +

%s

%s
+
%s
+ +]], item.title, item.title, feed_description, footer) + end +end + +-- @todo: move this elsewhere +function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title) + + local feed_output_dir = ("%s%s/"):format( + download_dir, + util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir))) + + -- Create the output directory if it doesn't exist. + if not lfs.attributes(feed_output_dir, "mode") then + lfs.mkdir(feed_output_dir) + end + + local file_name = FeedSource:getFeedTitle(epub_title) + + return ("%s%s%s"):format( + feed_output_dir, + file_name, + self.file_extension + ) +end + +function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback) + + local file_exists = lfs.attributes(abs_output_path, "mode") + + if file_exists then + logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path) + return true + end + + if #chapters == 0 then + error(_("Error: chapters contains 0 items"), 0) + end + + local images = {} + + for index, chapter in ipairs(chapters) do + for jndex, image in ipairs(chapter.images) do + table.insert( + images, + image + ) + end + end + + local epub = DownloadBackend:new{} + + progress_callback(T(_("Building EPUB %1"), title)) + epub:setTitle(title) + epub:addToc(chapters) + epub:addManifest(chapters, images) + + progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents"))) + epub:addContents(chapters) + + progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images"))) + epub:addImages(images) + + progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk"))) + local ok = pcall(function() + return epub:build(abs_output_path) + end) + + if ok then + if lfs.attributes(abs_output_path, "mode") then + return true + end + end + + return false +end + +local function parseDate(dateTime) + -- Uses lua-feedparser https://github.com/slact/lua-feedparser + -- feedparser is available under the (new) BSD license. + -- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser + local date = dateparser.parse(dateTime) + return os.date("%y-%m-%d_%H-%M_", date) +end + +function FeedSource:getFeedTitleWithDate(feed) + local title = util.getSafeFilename(FeedSource:getFeedTitle(feed.document.title)) + return os.date("%y-%m-%d_%H-%M_") .. title +end + +-- Creates a title with date from a feed item. +function FeedSource:getItemTitleWithDate(item) + local title = util.getSafeFilename(FeedSource:getFeedTitle(item.title)) + if item.updated then + title = parseDate(item.updated) .. title + elseif item.pubDate then + title = parseDate(item.pubDate) .. title + elseif item.published then + title = parseDate(item.published) .. title + end + return title +end + +-- If a title looks like blabla it'll just be feed.title. +-- If a title looks like blabla then we get a table +-- where [1] is the title string and the attributes are also available. +function FeedSource:getFeedTitle(possible_title) + if type(possible_title) == "string" then + return util.htmlEntitiesToUtf8(possible_title) + elseif possible_title[1] and type(possible_title[1]) == "string" then + return util.htmlEntitiesToUtf8(possible_title[1]) + end +end +-- There can be multiple links. +-- For now we just assume the first link is probably the right one. +--- @todo Write unit tests. +-- Some feeds that can be used for unit test. +-- http://fransdejonge.com/feed/ for multiple links. +-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes. 
+function FeedSource:getFeedLink(possible_link) + local E = {} + if type(possible_link) == "string" then + return possible_link + elseif (possible_link._attr or E).href then + return possible_link._attr.href + elseif ((possible_link[1] or E)._attr or E).href then + return possible_link[1]._attr.href + end +end + + +return FeedSource diff --git a/plugins/newsdownloader.koplugin/feed_view.lua b/plugins/newsdownloader.koplugin/feed_view.lua index 2e68d637e..b52f30f6d 100644 --- a/plugins/newsdownloader.koplugin/feed_view.lua +++ b/plugins/newsdownloader.koplugin/feed_view.lua @@ -7,7 +7,10 @@ local FeedView = { DOWNLOAD_FULL_ARTICLE = "download_full_article", INCLUDE_IMAGES = "include_images", ENABLE_FILTER = "enable_filter", - FILTER_ELEMENT = "filter_element" + FILTER_ELEMENT = "filter_element", + VOLUMIZE = "volumize", + ACTION_RESET_HISTORY = "reset_history", + ACTION_DELETE_FEED = "delete_feed", } function FeedView:getList(feed_config, callback, edit_feed_attribute_callback, delete_feed_callback) @@ -49,7 +52,7 @@ function FeedView:getList(feed_config, callback, edit_feed_attribute_callback, d return view_content end -function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback) +function FeedView:getItem(id, feed, edit_feed_callback, feed_action_callback) logger.dbg("NewsDownloader:", feed) @@ -67,6 +70,7 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback) local include_images = feed.include_images ~= false local enable_filter = feed.enable_filter ~= false local filter_element = feed.filter_element + local volumize = feed.volumize ~= false local vc = { { @@ -136,11 +140,22 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback) ) end }, + { + _("Volumize feed"), + volumize, + callback = function() + edit_feed_callback( + id, + FeedView.VOLUMIZE, + volumize + ) + end + }, } - -- We don't always display this. For instance: if a feed - -- is being created, this button is not necessary. - if delete_feed_callback then + -- These actions only pertain to initiated feeds, so we don't always + -- display them. 
+ if feed_action_callback then table.insert( vc, "---" @@ -151,8 +166,22 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback) _("Delete feed"), "", callback = function() - delete_feed_callback( - id + feed_action_callback( + id, + FeedView.ACTION_DELETE_FEED + ) + end + } + ) + table.insert( + vc, + { + _("Reset feed history"), + "", + callback = function() + feed_action_callback( + url, + FeedView.ACTION_RESET_HISTORY ) end } diff --git a/plugins/newsdownloader.koplugin/http_utilities.lua b/plugins/newsdownloader.koplugin/http_utilities.lua new file mode 100644 index 000000000..d79fc830e --- /dev/null +++ b/plugins/newsdownloader.koplugin/http_utilities.lua @@ -0,0 +1,126 @@ +local logger = require("logger") +local http = require("socket.http") +local socketutil = require("socketutil") +local socket_url = require("socket.url") +local socket = require("socket") +local ltn12 = require("ltn12") + +local NewsHelpers = { +} + +local max_redirects = 5; --prevent infinite redirects + +-- Get URL content +function NewsHelpers:getUrlContent(url, timeout, maxtime, redirectCount) + logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")") + if not redirectCount then + redirectCount = 0 + elseif redirectCount == max_redirects then + error("EpubDownloadBackend: reached max redirects: ", redirectCount) + end + + if not timeout then timeout = 10 end + logger.dbg("timeout:", timeout) + + local sink = {} + local parsed = socket_url.parse(url) + socketutil:set_timeout(timeout, maxtime or 30) + local request = { + url = url, + method = "GET", + sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink), + } + logger.dbg("request:", request) + local code, headers, status = socket.skip(1, http.request(request)) + socketutil:reset_timeout() + logger.dbg("After http.request") + local content = table.concat(sink) -- empty or content accumulated till now + logger.dbg("type(code):", type(code)) + logger.dbg("code:", code) + logger.dbg("headers:", headers) + logger.dbg("status:", status) + logger.dbg("#content:", #content) + + if code == socketutil.TIMEOUT_CODE or + code == socketutil.SSL_HANDSHAKE_CODE or + code == socketutil.SINK_TIMEOUT_CODE + then + logger.warn("request interrupted:", code) + return false, code + end + if headers == nil then + logger.warn("No HTTP headers:", code, status) + return false, "Network or remote server unavailable" + end + if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK + if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302... + local redirected_url = headers.location + local parsed_redirect_location = socket_url.parse(redirected_url) + if not parsed_redirect_location.host then + parsed_redirect_location.host = parsed.host + parsed_redirect_location.scheme = parsed.scheme + redirected_url = socket_url.build(parsed_redirect_location) + end + logger.dbg("getUrlContent: Redirecting to url: ", redirected_url) + return self:getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1) + else + -- error("EpubDownloadBackend: Don't know how to handle HTTP response status: " .. 
status) + -- error("EpubDownloadBackend: Don't know how to handle HTTP response status.") + logger.warn("HTTP status not okay:", code, status) + return false, status + end + end + if headers and headers["content-length"] then + -- Check we really got the announced content size + local content_length = tonumber(headers["content-length"]) + if #content ~= content_length then + return false, "Incomplete content received" + end + end + logger.dbg("Returning content ok") + return true, content +end + +function NewsHelpers:loadPage(url) + logger.dbg("Load page: ", url) + local success, content +--[[ if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget() + local Trapper = require("ui/trapper") + local timeout, maxtime = 30, 60 + -- We use dismissableRunInSubprocess with complex return values: + completed, success, content = Trapper:dismissableRunInSubprocess(function() + return NewsHelpers:getUrlContent(url, timeout, maxtime) + end, self.trap_widget) + if not completed then + error(self.dismissed_error_code) -- "Interrupted by user" + end + else]]-- + local timeout, maxtime = 10, 60 + success, content = NewsHelpers:getUrlContent(url, timeout, maxtime) +-- end + logger.dbg("success:", success, "type(content):", type(content), "content:", content:sub(1, 500), "...") + if not success then + error(content) + else + return content + end +end + +function NewsHelpers:deserializeXMLString(xml_str) + -- uses LuaXML https://github.com/manoelcampos/LuaXML + -- The MIT License (MIT) + -- Copyright (c) 2016 Manoel Campos da Silva Filho + -- see: koreader/plugins/newsdownloader.koplugin/lib/LICENSE_LuaXML + local treehdl = require("lib/handler") + local libxml = require("lib/xml") + -- Instantiate the object that parses the XML file as a Lua table. + local xmlhandler = treehdl.simpleTreeHandler() + -- Instantiate the object that parses the XML to a Lua table. 
+ local ok = pcall(function() + libxml.xmlParser(xmlhandler):parse(xml_str) + end) + if not ok then return end + return xmlhandler.root +end + +return NewsHelpers diff --git a/plugins/newsdownloader.koplugin/main.lua b/plugins/newsdownloader.koplugin/main.lua index 790aaa03e..029affbac 100644 --- a/plugins/newsdownloader.koplugin/main.lua +++ b/plugins/newsdownloader.koplugin/main.lua @@ -1,11 +1,9 @@ local BD = require("ui/bidi") local DataStorage = require("datastorage") ---local DownloadBackend = require("internaldownloadbackend") ---local DownloadBackend = require("luahttpdownloadbackend") -local DownloadBackend = require("epubdownloadbackend") local ReadHistory = require("readhistory") local FFIUtil = require("ffi/util") local FeedView = require("feed_view") +local FeedSource = require("feed_source") local InfoMessage = require("ui/widget/infomessage") local LuaSettings = require("frontend/luasettings") local UIManager = require("ui/uimanager") @@ -15,7 +13,6 @@ local MultiConfirmBox = require("ui/widget/multiconfirmbox") local NetworkMgr = require("ui/network/manager") local Persist = require("persist") local WidgetContainer = require("ui/widget/container/widgetcontainer") -local dateparser = require("lib.dateparser") local logger = require("logger") local util = require("util") local _ = require("gettext") @@ -27,10 +24,11 @@ local NewsDownloader = WidgetContainer:new{ feed_config_file = "feed_config.lua", feed_config_path = nil, news_config_file = "news_settings.lua", + news_history_file = "news_history.lua", settings = nil, + history = nil, download_dir_name = "news", download_dir = nil, - file_extension = ".epub", config_key_custom_dl_dir = "custom_dl_dir", empty_feed = { [1] = "https://", @@ -38,46 +36,12 @@ local NewsDownloader = WidgetContainer:new{ download_full_article = true, include_images = true, enable_filter = false, - filter_element = "" + filter_element = "", + volumize = false }, kv = {} } -local FEED_TYPE_RSS = "rss" -local FEED_TYPE_ATOM = "atom" - ---local initialized = false ---local feed_config_file_name = "feed_config.lua" ---local news_downloader_config_file = "news_downloader_settings.lua - --- If a title looks like blabla it'll just be feed.title. --- If a title looks like blabla then we get a table --- where [1] is the title string and the attributes are also available. -local function getFeedTitle(possible_title) - if type(possible_title) == "string" then - return util.htmlEntitiesToUtf8(possible_title) - elseif possible_title[1] and type(possible_title[1]) == "string" then - return util.htmlEntitiesToUtf8(possible_title[1]) - end -end - --- There can be multiple links. --- For now we just assume the first link is probably the right one. ---- @todo Write unit tests. --- Some feeds that can be used for unit test. --- http://fransdejonge.com/feed/ for multiple links. --- https://github.com/koreader/koreader/commits/master.atom for single link with attributes. 
-local function getFeedLink(possible_link) - local E = {} - if type(possible_link) == "string" then - return possible_link - elseif (possible_link._attr or E).href then - return possible_link._attr.href - elseif ((possible_link[1] or E)._attr or E).href then - return possible_link[1]._attr.href - end -end - function NewsDownloader:init() self.ui.menu:registerToMainMenu(self) end @@ -105,7 +69,39 @@ function NewsDownloader:getSubMenuItems() text = _("Sync news feeds"), keep_menu_open = true, callback = function(touchmenu_instance) - NetworkMgr:runWhenOnline(function() self:loadConfigAndProcessFeedsWithUI(touchmenu_instance) end) + NetworkMgr:runWhenOnline( + function() self:syncAllFeedsWithUI( + touchmenu_instance, + function(feed_message) + -- Callback to fire after sync is finished + local UI = require("ui/trapper") + -- This callback is called after the + -- processing is complete. + -- + -- Clear the info widgets before displaying the next ui widget. + -- UI:clear() + -- Ask the user if they want to go to their downloads folder + -- or if they'd rather remain at the menu. + feed_message = feed_message _("Go to downloaders folder?") + local should_go_to_downloads = UI:confirm( + feed_message, + _("Close"), + _("Go to downloads") + ) + if should_go_to_downloads then + -- Go to downloads folder. + UI:clear() + self:openDownloadsFolder() + touchmenu_instance:closeMenu() + NetworkMgr:afterWifiAction() + return + else + -- Return to the menu. + NetworkMgr:afterWifiAction() + return + end + end + ) end) end, }, { @@ -127,17 +123,6 @@ function NewsDownloader:getSubMenuItems() keep_menu_open = true, callback = function() self:setCustomDownloadDirectory() end, }, - { - text = _("Never download images"), - keep_menu_open = true, - checked_func = function() - return self.settings:isTrue("never_download_images") - end, - callback = function() - self.settings:toggle("never_download_images") - self.settings:flush() - end, - }, { text = _("Delete all downloaded items"), keep_menu_open = true, @@ -151,6 +136,9 @@ function NewsDownloader:getSubMenuItems() ) if should_delete then self:removeNewsButKeepFeedConfig() + -- Move user to the downloads folder to avoid an error where they + -- are within a feed folder which we have just deleted. + self:openDownloadsFolder() Trapper:reset() else Trapper:reset() @@ -173,7 +161,7 @@ function NewsDownloader:getSubMenuItems() } return sub_item_table end --- lazyInitialization sets up variables that point to the +-- lazyInitialization sets up our variables to point to the -- Downloads folder and the feeds configuration file. function NewsDownloader:lazyInitialization() if not self.initialized then @@ -188,6 +176,8 @@ function NewsDownloader:lazyInitialization() DataStorage:getFullDataDir(), self.download_dir_name) end + logger.dbg("NewsDownloader: initializing download history") + self.history = LuaSettings:open(("%s/%s"):format(DataStorage:getSettingsDir(), self.news_history_file)) logger.dbg("NewsDownloader: Custom directory set to:", self.download_dir) -- If the directory doesn't exist we will create it. if not lfs.attributes(self.download_dir, "mode") then @@ -205,15 +195,16 @@ function NewsDownloader:lazyInitialization() self.initialized = true end end - -function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance) +-- This function loads the config file. If the config is not available +-- then this function includes prompts for handling that. 
+function NewsDownloader:loadConfig() local UI = require("ui/trapper") logger.dbg("force repaint due to upcoming blocking calls") - + -- Check if the feed config file exists local ok, feed_config = pcall(dofile, self.feed_config_path) if not ok or not feed_config then UI:info(T(_("Invalid configuration file. Detailed error message:\n%1"), feed_config)) - return + return false end -- If the file contains no table elements, then the user hasn't set any feeds. if #feed_config <= 0 then @@ -237,344 +228,198 @@ function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance) feed_item_vc ) end - return + return false end - - local never_download_images = self.settings:isTrue("never_download_images") - local unsupported_feeds_urls = {} - local total_feed_entries = #feed_config - local feed_message - - for idx, feed in ipairs(feed_config) do - local url = feed[1] - local limit = feed.limit - local download_full_article = feed.download_full_article == nil or feed.download_full_article - local include_images = not never_download_images and feed.include_images - local enable_filter = feed.enable_filter or feed.enable_filter == nil - local filter_element = feed.filter_element or feed.filter_element == nil - -- Check if the two required attributes are set. - if url and limit then - feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url)) - UI:info(feed_message) - -- Process the feed source. - self:processFeedSource( - url, - tonumber(limit), - unsupported_feeds_urls, - download_full_article, - include_images, - feed_message, - enable_filter, - filter_element) - else - logger.warn("NewsDownloader: invalid feed config entry.", feed) - end - end - - if #unsupported_feeds_urls <= 0 then - -- When no errors are present, we get a happy message. - feed_message = _("Downloading news finished.") - else - -- When some errors are present, we get a sour message that includes - -- information about the source of the error. - local unsupported_urls = "" - for key, value in pairs(unsupported_feeds_urls) do - -- Create the error message. - unsupported_urls = unsupported_urls .. " " .. value[1] .. " " .. value[2] - -- Not sure what this does. - if key ~= #unsupported_feeds_urls then - unsupported_urls = BD.url(unsupported_urls) .. ", " - end - end - -- Tell the user there were problems. - feed_message = _("Downloading news finished with errors.") - -- Display a dialogue that requires the user to acknowledge - -- that errors occured. - UI:confirm( - T(_([[ -Could not process some feeds. -Unsupported format in: %1. Please -review your feed configuration file.]]) - , unsupported_urls), - _("Continue"), - "" - ) - end - -- Clear the info widgets before displaying the next ui widget. - UI:clear() - -- Check to see if this method was called from the menu. If it was, - -- we will have gotten a touchmenu_instance. This will context gives the user - -- two options about what to do next, which are handled by this block. - if touchmenu_instance then - -- Ask the user if they want to go to their downloads folder - -- or if they'd rather remain at the menu. - feed_message = feed_message .. _("Go to download folder?") - local should_go_to_downloads = UI:confirm( - feed_message, - _("Close"), - _("Go to downloads") - ) - if should_go_to_downloads then - -- Go to downloads folder. - UI:clear() - self:openDownloadsFolder() - touchmenu_instance:closeMenu() - NetworkMgr:afterWifiAction() - return - else - -- Return to the menu. 
- NetworkMgr:afterWifiAction() - return - end - end - return + -- If we made it this far, then the feed config is valid + -- and the next step is to process its contents + return feed_config end -function NewsDownloader:loadConfigAndProcessFeedsWithUI(touchmenu_instance) +function NewsDownloader:syncAllFeedsWithUI(touchmenu_instance, callback) local Trapper = require("ui/trapper") Trapper:wrap(function() - self:loadConfigAndProcessFeeds(touchmenu_instance) - end) -end - -function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element) - local ok, response = pcall(function() - return DownloadBackend:getResponseAsString(url) - end) - local feeds - -- Check to see if a response is available to deserialize. - if ok then - feeds = self:deserializeXMLString(response) - end - -- If the response is not available (for a reason that we don't know), - -- add the URL to the unsupported feeds list. - if not ok or not feeds then - local error_message - if not ok then - error_message = _("(Reason: Failed to download content)") - else - error_message = _("(Reason: Error during feed deserialization)") - end - table.insert( - unsupported_feeds_urls, - { - url, - error_message - } - ) - return - end - -- Check to see if the feed uses RSS. - local is_rss = feeds.rss - and feeds.rss.channel - and feeds.rss.channel.title - and feeds.rss.channel.item - and feeds.rss.channel.item[1] - and feeds.rss.channel.item[1].title - and feeds.rss.channel.item[1].link - -- Check to see if the feed uses Atom. - local is_atom = feeds.feed - and feeds.feed.title - and feeds.feed.entry[1] - and feeds.feed.entry[1].title - and feeds.feed.entry[1].link - -- Process the feeds accordingly. - if is_atom then - ok = pcall(function() - return self:processFeed( - FEED_TYPE_ATOM, - feeds, - limit, - download_full_article, - include_images, - message, - enable_filter, - filter_element - ) - end) - elseif is_rss then - ok = pcall(function() - return self:processFeed( - FEED_TYPE_RSS, - feeds, - limit, - download_full_article, - include_images, - message, - enable_filter, - filter_element - ) - end) - end - -- If the feed can't be processed, or it is neither - -- Atom or RSS, then add it to the unsupported feeds list - -- and return an error message. - if not ok or (not is_rss and not is_atom) then - local error_message - if not ok then - error_message = _("(Reason: Failed to download content)") - elseif not is_rss then - error_message = _("(Reason: Couldn't process RSS)") - elseif not is_atom then - error_message = _("(Reason: Couldn't process Atom)") - end - table.insert( - unsupported_feeds_urls, - { - url, - error_message - } - ) - end -end - -function NewsDownloader:deserializeXMLString(xml_str) - -- uses LuaXML https://github.com/manoelcampos/LuaXML - -- The MIT License (MIT) - -- Copyright (c) 2016 Manoel Campos da Silva Filho - -- see: koreader/plugins/newsdownloader.koplugin/lib/LICENSE_LuaXML - local treehdl = require("lib/handler") - local libxml = require("lib/xml") - -- Instantiate the object that parses the XML file as a Lua table. - local xmlhandler = treehdl.simpleTreeHandler() - -- Instantiate the object that parses the XML to a Lua table. 
- local ok = pcall(function() - libxml.xmlParser(xmlhandler):parse(xml_str) - end) - if not ok then return end - return xmlhandler.root -end - -function NewsDownloader:processFeed(feed_type, feeds, limit, download_full_article, include_images, message, enable_filter, filter_element) - local feed_title - local feed_item - local total_items - -- Setup the above vars based on feed type. - if feed_type == FEED_TYPE_RSS then - feed_title = util.htmlEntitiesToUtf8(feeds.rss.channel.title) - feed_item = feeds.rss.channel.item - total_items = (limit == 0) - and #feeds.rss.channel.item - or limit - else - feed_title = getFeedTitle(feeds.feed.title) - feed_item = feeds.feed.entry - total_items = (limit == 0) - and #feeds.feed.entry - or limit - end - -- Get the path to the output directory. - local feed_output_dir = ("%s%s/"):format( - self.download_dir, - util.getSafeFilename(util.htmlEntitiesToUtf8(feed_title))) - -- Create the output directory if it doesn't exist. - if not lfs.attributes(feed_output_dir, "mode") then - lfs.mkdir(feed_output_dir) - end - -- Download the feed - for index, feed in pairs(feed_item) do - -- If limit has been met, stop downloading feed. - if limit ~= 0 and index - 1 == limit then - break - end - -- Create a message to display during processing. - local article_message = T( - _("%1\n\nFetching article %2/%3:"), - message, - index, - total_items - ) - -- Get the feed description. - local feed_description - if feed_type == FEED_TYPE_RSS then - feed_description = feed.description - else - feed_description = feed.summary - end - -- Download the article. - if download_full_article then - self:downloadFeed( - feed, - feed_output_dir, - include_images, - article_message, - enable_filter, - filter_element + local UI = require("ui/trapper") + -- Get the config + local config = self:loadConfig() + local sync_errors = {} + -- Get the HTML for the feeds + local feedSource = FeedSource:new{} + -- Get the initialized feeds list + local initialized_feeds = feedSource:getInitializedFeeds( + config, + function(progress_message) + -- This callback relays updates to the UI + UI:info(progress_message) + end, + function(error_message) + table.insert( + sync_errors, + error_message + ) + end ) - else - self:createFromDescription( - feed, - feed_description, - feed_output_dir, - include_images, - article_message - ) - end - end -end + -- In this block, each feed item will be its own + -- epub complete with title and chapters + local epubs_to_make = {} + local epubs_successfully_created = {} + local feed_history = {} -local function parseDate(dateTime) - -- Uses lua-feedparser https://github.com/slact/lua-feedparser - -- feedparser is available under the (new) BSD license. - -- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser - local date = dateparser.parse(dateTime) - return os.date("%y-%m-%d_%H-%M_", date) -end + for feed_index, feed in pairs(initialized_feeds) do + -- Go through each feed and make new entry + local items_content = feedSource:getItemsContent( + feed, + function(progress_message) + UI:info(progress_message) + end, + function(error_message) + table.insert( + sync_errors, + error_message + ) + end + ) --- This appears to be used by Atom feeds in processFeed. -local function getTitleWithDate(feed) - local title = util.getSafeFilename(getFeedTitle(feed.title)) - if feed.updated then - title = parseDate(feed.updated) .. title - elseif feed.pubDate then - title = parseDate(feed.pubDate) .. 
title - elseif feed.published then - title = parseDate(feed.published) .. title - end - return title -end + local volumize = feed.config.volumize ~= false + local chapters = {} + local feed_title = feedSource:getFeedTitleWithDate(feed) + local feed_id = feed.config[1] -- The url. + local sub_dir = feedSource:getFeedTitle(feed.document.title) + local item_history = {} -function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element) - local title_with_date = getTitleWithDate(feed) - local news_file_path = ("%s%s%s"):format(feed_output_dir, - title_with_date, - self.file_extension) + for content_index, content in pairs(items_content) do + -- Check to see if we've already downloaded this item. + local history_for_feed = self.history:child(feed_id) - local file_mode = lfs.attributes(news_file_path, "mode") - if file_mode == "file" then - logger.dbg("NewsDownloader:", news_file_path, "already exists. Skipping") - else - logger.dbg("NewsDownloader: News file will be stored to :", news_file_path) - local article_message = T(_("%1\n%2"), message, title_with_date) - local link = getFeedLink(feed.link) - local html = DownloadBackend:loadPage(link) - DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element) - end -end + if history_for_feed:has(content.md5) then + logger.dbg("NewsDownloader: ", "Item already downloaded") + UI:info(_("Skipping downloaded item")) + else + local abs_path = feedSource:getEpubOutputDir( + self.download_dir, + sub_dir, + content.item_title + ) -function NewsDownloader:createFromDescription(feed, content, feed_output_dir, include_images, message) - local title_with_date = getTitleWithDate(feed) - local news_file_path = ("%s%s%s"):format(feed_output_dir, - title_with_date, - self.file_extension) - local file_mode = lfs.attributes(news_file_path, "mode") - if file_mode == "file" then - logger.dbg("NewsDownloader:", news_file_path, "already exists. Skipping") - else - logger.dbg("NewsDownloader: News file will be stored to :", news_file_path) - local article_message = T(_("%1\n%2"), message, title_with_date) - local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.") + -- Not sure the slug returned is what we want. + -- Should be something like 2022_09_20-ArticleTitle + table.insert( + chapters, + { + title = content.item_title, + slug = content.item_slug, + md5 = content.md5, + html = content.html, + images = content.images + } + ) - local html = string.format([[ - -%s -
-<body>
-<header><h2>%s</h2></header>
-<article>%s</article>
-<br><footer><small>%s</small></footer>
-</body>
- -]], feed.title, feed.title, content, footer) - local link = getFeedLink(feed.link) - DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message) - end + if not volumize then + -- We're not volumizing, so each chapter + -- will be its own epub. + table.insert( + epubs_to_make, + { + title = content.item_title, + chapters = chapters, + abs_path = abs_path, + id = feed_id, + } + ) + -- Reset the chapters list. + chapters = {} + end + + table.insert( + item_history, + content.md5 + ) + end + end + -- We're volumizing, so all of the chapters we collected + -- get added to a single epub. + if volumize and #chapters > 0 then + local abs_path = feedSource:getEpubOutputDir( + self.download_dir, + sub_dir, + feed_title + ) + + table.insert( + epubs_to_make, + { + title = feed_title, + chapters = chapters, + abs_path = abs_path, + id = feed_id, + } + ) + end + + feed_history[feed_id] = item_history + end + + -- Make each EPUB. + for epub_index, epub in pairs(epubs_to_make) do + local ok = feedSource:createEpub( + epub.title, + epub.chapters, + epub.abs_path, + function(progress_message) + UI:info(progress_message) + end, + function(error_message) + table.insert( + sync_errors, + error_message + ) + end + ) + if ok then + -- Save the hashes to the setting for this feed. + local hashes_to_save = feed_history[epub.id] + local history_for_feed = self.history:child(epub.id) + + for index, hash in ipairs(hashes_to_save) do + if history_for_feed:hasNot(hash) then + history_for_feed:saveSetting(hash, true) + end + end + -- Add the epub title to the successfully created table. + table.insert( + epubs_successfully_created, + epub.title + ) + else + table.insert( + sync_errors, + T( + _('Error building EPUB %1'), + epub.title + ) + ) + end + end + + logger.dbg(epubs_to_make) + + self.history:flush() + + -- Relay any errors + for index, error_message in pairs(sync_errors) do + UI:confirm( + error_message, + _("Continue"), + "" + ) + end + + local message = (#epubs_successfully_created == 0) and + _("Sync complete. No new EPUBs created.") or + T(_("Sync complete. EPUBs created: %1"), + table.concat(epubs_successfully_created, ", ")) + + callback(message) + end) end function NewsDownloader:removeNewsButKeepFeedConfig() @@ -591,7 +436,7 @@ function NewsDownloader:removeNewsButKeepFeedConfig() end end UIManager:show(InfoMessage:new{ - text = _("All downloaded news feed items deleted.") + text = _("All downloaded news feed items deleted. To download these again in the future, reset the feed history.") }) end @@ -612,11 +457,10 @@ function NewsDownloader:setCustomDownloadDirectory() end function NewsDownloader:viewFeedList() - local UI = require("ui/trapper") - UI:info(_("Loading news feed list…")) -- Protected call to see if feed config path returns a file that can be opened. local ok, feed_config = pcall(dofile, self.feed_config_path) if not ok or not feed_config then + local UI = require("ui/trapper") local change_feed_config = UI:confirm( _("Could not open feed list. Feeds configuration file is invalid."), _("Close"), @@ -627,15 +471,6 @@ function NewsDownloader:viewFeedList() end return end - UI:clear() - -- See if the config file contains any feed items - if #feed_config <= 0 then - logger.err("NewsDownloader: empty feed list.", self.feed_config_path) - -- Why not ask the user if they want to add one? 
- -- Or, in future, move along to our list UI with an entry for new feeds - - --return - end local view_content = FeedView:getList( feed_config, @@ -647,8 +482,25 @@ function NewsDownloader:viewFeedList() function(id, edit_key, value) self:editFeedAttribute(id, edit_key, value) end, - function(id) - self:deleteFeed(id) + function(id, action) + if action == FeedView.ACTION_DELETE_FEED then + self:deleteFeed(id) + elseif action == FeedView.ACTION_RESET_HISTORY then + local Trapper = require("ui/trapper") + Trapper:wrap(function() + local should_reset = Trapper:confirm( + _("Are you sure you want to reset the feed history? Proceeding will cause items to be re-downloaded next time you sync."), + _("Cancel"), + _("Reset") + ) + if should_reset then + self:resetFeedHistory(id) + Trapper:reset() + else + Trapper:reset() + end + end) + end end ) -- Add a "Add new feed" button with callback @@ -704,10 +556,15 @@ end function NewsDownloader:editFeedAttribute(id, key, value) local kv = self.kv - -- There are basically two types of values: string (incl. numbers) - -- and booleans. This block chooses what type of value our - -- attribute will need and displays the corresponding dialog. - if key == FeedView.URL + -- This block determines what kind of UI to produce, or action to run, + -- based on the key value. Some values need an input dialog, others need + -- a Yes/No dialog. + if key == FeedView.RESET_HISTORY then + -- Show a "are you sure" box. + -- Reset the history + self.history:removeTableItem(value, 1) + self.history:flush() + elseif key == FeedView.URL or key == FeedView.LIMIT or key == FeedView.FILTER_ELEMENT then @@ -767,6 +624,8 @@ function NewsDownloader:editFeedAttribute(id, key, value) text = _("Include images?") elseif key == FeedView.ENABLE_FILTER then text = _("Enable CSS filter?") + elseif key == FeedView.VOLUMIZE then + text = _("Volumize feed?") end local multi_box @@ -810,6 +669,7 @@ function NewsDownloader:updateFeedConfig(id, key, value) end local ok, feed_config = pcall(dofile, self.feed_config_path) + if not ok or not feed_config then UI:info(T(_("Invalid configuration file. Detailed error message:\n%1"), feed_config)) return @@ -818,7 +678,6 @@ function NewsDownloader:updateFeedConfig(id, key, value) if #feed_config <= 0 then logger.dbg("NewsDownloader: empty feed list.", self.feed_config_path) end - -- Check to see if the id is larger than the number of feeds. If it is, -- then we know this is a new add. Insert the base array. 
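-- Illustrative sketch only (hypothetical URL and values): a feed entry in the
-- user's feed_config.lua is a table whose first element is the feed URL, with
-- the options stored under named keys, e.g.
--   { "https://example.com/feed.xml",
--     limit = 5,
--     download_full_article = true,
--     include_images = true,
--     enable_filter = false,
--     filter_element = "",
--     volumize = true, -- new option: collect new items into a single EPUB
--   },
-- which is why the branches below can simply assign feed.limit, feed.volumize,
-- and so on directly.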
if id > #feed_config then @@ -852,65 +711,17 @@ function NewsDownloader:updateFeedConfig(id, key, value) ) end elseif key == FeedView.LIMIT then - if feed.limit then - feed.limit = value - else - table.insert( - feed, - { - "limit", - value - } - ) - end + feed.limit = value elseif key == FeedView.DOWNLOAD_FULL_ARTICLE then - if feed.download_full_article ~= nil then - feed.download_full_article = value - else - table.insert( - feed, - { - "download_full_article", - value - } - ) - end + feed.download_full_article = value elseif key == FeedView.INCLUDE_IMAGES then - if feed.include_images ~= nil then - feed.include_images = value - else - table.insert( - feed, - { - "include_images", - value - } - ) - end + feed.include_images = value elseif key == FeedView.ENABLE_FILTER then - if feed.enable_filter ~= nil then - feed.enable_filter = value - else - table.insert( - feed, - { - "enable_filter", - value - } - ) - end + feed.enable_filter = value elseif key == FeedView.FILTER_ELEMENT then - if feed.filter_element then - feed.filter_element = value - else - table.insert( - feed, - { - "filter_element", - value - } - ) - end + feed.filter_element = value + elseif key == FeedView.VOLUMIZE then + feed.volumize = value end end -- Now we insert the updated (or newly created) feed into the @@ -929,12 +740,31 @@ function NewsDownloader:updateFeedConfig(id, key, value) new_config[id], function(cb_id, cb_edit_key, cb_value) self:editFeedAttribute(cb_id, cb_edit_key, cb_value) + end, + function(feed_id, action) + if action == FeedView.ACTION_DELETE_FEED then + self:deleteFeed(feed_id) + elseif action == FeedView.ACTION_RESET_HISTORY then + local Trapper = require("ui/trapper") + Trapper:wrap(function() + local should_reset = Trapper:confirm( + _("Are you sure you want to reset the feed history? Proceeding will cause items to be re-downloaded next time you sync."), + _("Cancel"), + _("Reset") + ) + if should_reset then + self:resetFeedHistory(id) + Trapper:reset() + else + Trapper:reset() + end + end) + end end ) self:viewFeedItem( feed_item_vc ) - end function NewsDownloader:deleteFeed(id) @@ -942,6 +772,7 @@ function NewsDownloader:deleteFeed(id) logger.dbg("Newsdownloader: attempting to delete feed") -- Check to see if we can get the config file. local ok, feed_config = pcall(dofile, self.feed_config_path) + if not ok or not feed_config then UI:info(T(_("Invalid configuration file. Detailed error message:\n%1"), feed_config)) return @@ -951,6 +782,7 @@ function NewsDownloader:deleteFeed(id) -- and key (i.e.: the key that triggered this function. -- If we are at the right spot, we overrite (or create) the value local new_config = {} + for idx, feed in ipairs(feed_config) do -- Check to see if this is the correct feed to update. 
if idx ~= id then @@ -962,6 +794,7 @@ function NewsDownloader:deleteFeed(id) end -- Save the config local Trapper = require("ui/trapper") + Trapper:wrap(function() logger.dbg("NewsDownloader: config to save", new_config) self:saveConfig(new_config) @@ -970,6 +803,14 @@ function NewsDownloader:deleteFeed(id) self:viewFeedList() end +function NewsDownloader:resetFeedHistory(url) + logger.dbg("Newsdownloader: attempting to reset feed history") + self.history:saveSetting(url, {}) + self.history:flush() + -- Refresh the view + self:viewFeedList() +end + function NewsDownloader:saveConfig(config) local UI = require("ui/trapper") UI:info(_("Saving news feed list…")) @@ -985,6 +826,9 @@ function NewsDownloader:saveConfig(config) UI:reset() end +-- This function opens an input dialog that lets the user +-- manually change their feed config. This function is called +-- when there is an error with the parsing. function NewsDownloader:changeFeedConfig() local feed_config_file = io.open(self.feed_config_path, "rb") local config = feed_config_file:read("*all") @@ -1027,6 +871,7 @@ function NewsDownloader:changeFeedConfig() UIManager:show(config_editor) config_editor:onShowKeyboard() end + function NewsDownloader:openDownloadsFolder() local FileManager = require("apps/filemanager/filemanager") if self.ui.document then @@ -1051,38 +896,4 @@ function NewsDownloader:onCloseDocument() end end --- --- KeyValuePage doesn't like to get a table with sub tables. --- This function flattens an array, moving all nested tables --- up the food chain, so to speak --- -function NewsDownloader:flattenArray(base_array, source_array) - for key, value in pairs(source_array) do - if value[2] == nil then - -- If the value is empty, then it's probably supposed to be a line - table.insert( - base_array, - "---" - ) - else - if value["callback"] then - table.insert( - base_array, - { - value[1], value[2], callback = value["callback"] - } - ) - else - table.insert( - base_array, - { - value[1], value[2] - } - ) - end - end - end - return base_array -end - return NewsDownloader
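A minimal sketch of how the new per-feed history is meant to work, assuming
self.history is the LuaSettings instance the plugin opens elsewhere in main.lua
(the file name, URL and hash below are hypothetical): each feed URL maps to a
table of MD5 hashes of items that have already been downloaded, so a later sync
can skip them.

    -- Hypothetical on-disk shape of the history settings file:
    -- return {
    --     ["https://example.com/feed.xml"] = {
    --         ["d41d8cd98f00b204e9800998ecf8427e"] = true,
    --     },
    -- }
    local history_for_feed = self.history:child(feed_url)
    if history_for_feed:hasNot(item_md5) then
        -- Mark this item as downloaded and persist the change.
        history_for_feed:saveSetting(item_md5, true)
        self.history:flush()
    end

Resetting a feed's history simply replaces that per-feed table with an empty
one (self.history:saveSetting(url, {})), after which every item in the feed is
treated as new on the next sync.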