NewsDownloader: new option to allow EPUB volumization (#8263)

When this feature is enabled on a feed and that feed is
synced, all new feed entries will be collected into a new
single EPUB file. This is achieved by implementing a feed
history feature (downloaded feeds are added as MD5 hashes
to a LuaSettings file), and by introducing additional
methods into epubdownloader.lua that allow for multiple
HTML documents to be added into a single EPUB file.
This commit is contained in:
Scarlett
2022-01-21 14:23:21 -04:00
committed by GitHub
parent f468f873bd
commit 8a04dc9852
5 changed files with 1163 additions and 828 deletions

View File

@@ -1,15 +1,10 @@
local NewsHelpers = require("http_utilities")
local Version = require("version")
local ffiutil = require("ffi/util")
local http = require("socket.http")
local logger = require("logger")
local ltn12 = require("ltn12")
local socket = require("socket")
local socket_url = require("socket.url")
local socketutil = require("socketutil")
local _ = require("gettext")
local T = ffiutil.template
local EpubDownloadBackend = {
local EpubBuilder = {
-- Can be set so HTTP requests will be done under Trapper and
-- be interruptible
trap_widget = nil,
@@ -17,8 +12,89 @@ local EpubDownloadBackend = {
-- and error() with this code. We make the value of this error
-- accessible here so that caller can know it's a user dismiss.
dismissed_error_code = "Interrupted by user",
title = nil,
ncx_toc = nil,
ncx_manifest = nil,
ncx_contents = nil,
ncx_images = nil,
}
local max_redirects = 5; --prevent infinite redirects
--- Construct a new EpubBuilder instance (prototype pattern).
-- @tparam[opt] table o seed table to turn into an instance
-- @treturn table the new instance
function EpubBuilder:new(o)
    local instance = o or {}
    self.__index = self
    return setmetatable(instance, self)
end
--- Assemble the previously staged manifest, TOC, contents and images into
-- an EPUB (zip) archive at abs_output_path.
-- Writes to a ".tmp" file first, as crengine may still have a handle to the
-- final epub path, and we don't want to delete a good one if we fail/cancel
-- later; the temp file is renamed into place on success.
-- Raises if the manifest, TOC or contents were not staged beforehand
-- (see addManifest/addToc/addContents). Returns false if the temp zip
-- cannot be opened.
function EpubBuilder:build(abs_output_path)
    local tmp_path = abs_output_path .. ".tmp"
    local ZipWriter = require("ffi/zipwriter")
    local epub = ZipWriter:new{}
    if not epub:open(tmp_path) then
        logger.dbg("Failed to open tmp_path")
        return false
    end
    epub:add("mimetype", "application/epub+zip")
    epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>]])
    -- Add the manifest.
    if not self.ncx_manifest or #self.ncx_manifest == 0 then
        error("EPUB does not contain a valid manifest.")
    end
    epub:add("OEBPS/content.opf", table.concat(self.ncx_manifest))
    -- Add the table of contents.
    if not self.ncx_toc or #self.ncx_toc == 0 then
        error("EPUB does not contain a valid table of contents.")
    end
    epub:add("OEBPS/toc.ncx", table.concat(self.ncx_toc))
    -- Add the contents.
    -- BUGFIX: the original tested #self.ncx_manifest here, so an empty
    -- contents list with a non-empty manifest slipped through.
    if not self.ncx_contents or #self.ncx_contents == 0 then
        error("EPUB does not contain any content.")
    end
    for _, content in ipairs(self.ncx_contents) do
        epub:add("OEBPS/" .. content.filename, content.html)
    end
    -- Add the images (optional).
    if self.ncx_images then
        for _, image in ipairs(self.ncx_images) do
            epub:add(
                "OEBPS/" .. image.path,
                image.content,
                image.no_compression
            )
        end
    end
    epub:close()
    -- Move the finished archive into place.
    os.rename(tmp_path, abs_output_path)
    collectgarbage()
end
--- Release resources held by the builder.
-- Currently a no-op stub so callers can invoke it unconditionally.
function EpubBuilder:release()
    -- Stub for cleanup methods
end
-- filter HTML using CSS selector
local function filter(text, element)
@@ -68,79 +144,9 @@ local function filter(text, element)
return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
end
-- Get URL content.
-- Follows HTTP redirects (301/302/...) up to max_redirects.
-- Returns (true, content) on success, or (false, error_message_or_code)
-- on failure.
local function getUrlContent(url, timeout, maxtime, redirectCount)
    logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")")
    if not redirectCount then
        redirectCount = 0
    elseif redirectCount == max_redirects then
        -- BUGFIX: error() takes (message, level); the original passed
        -- redirectCount as the level argument, dropping it from the message.
        error("EpubDownloadBackend: reached max redirects: " .. redirectCount)
    end
    if not timeout then timeout = 10 end
    logger.dbg("timeout:", timeout)
    local sink = {}
    local parsed = socket_url.parse(url)
    socketutil:set_timeout(timeout, maxtime or 30)
    local request = {
        url = url,
        method = "GET",
        sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink),
    }
    logger.dbg("request:", request)
    local code, headers, status = socket.skip(1, http.request(request))
    socketutil:reset_timeout()
    logger.dbg("After http.request")
    local content = table.concat(sink) -- empty or content accumulated till now
    logger.dbg("type(code):", type(code))
    logger.dbg("code:", code)
    logger.dbg("headers:", headers)
    logger.dbg("status:", status)
    logger.dbg("#content:", #content)
    if code == socketutil.TIMEOUT_CODE or
       code == socketutil.SSL_HANDSHAKE_CODE or
       code == socketutil.SINK_TIMEOUT_CODE
    then
        logger.warn("request interrupted:", code)
        return false, code
    end
    if headers == nil then
        logger.warn("No HTTP headers:", code, status)
        return false, "Network or remote server unavailable"
    end
    if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
        if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302...
            local redirected_url = headers.location
            local parsed_redirect_location = socket_url.parse(redirected_url)
            if not parsed_redirect_location.host then
                parsed_redirect_location.host = parsed.host
                parsed_redirect_location.scheme = parsed.scheme
                redirected_url = socket_url.build(parsed_redirect_location)
            end
            logger.dbg("getUrlContent: Redirecting to url: ", redirected_url)
            return getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1)
        end
        -- BUGFIX: the original error()'d here (again passing `status` as
        -- error's level argument), which made the warn/return below
        -- unreachable; report through the documented (false, message)
        -- return instead so callers can handle the failure.
        logger.warn("HTTP status not okay:", code, status)
        return false, "Remote server error or unavailable"
    end
    if headers and headers["content-length"] then
        -- Check we really got the announced content size
        local content_length = tonumber(headers["content-length"])
        if #content ~= content_length then
            return false, "Incomplete content received"
        end
    end
    logger.dbg("Returning content ok")
    return true, content
end
function EpubDownloadBackend:getResponseAsString(url)
logger.dbg("EpubDownloadBackend:getResponseAsString(", url, ")")
local success, content = getUrlContent(url)
function EpubBuilder:getResponseAsString(url)
logger.dbg("EpubBuilder:getResponseAsString(", url, ")")
local success, content = NewsHelpers:getUrlContent(url)
if (success) then
return content
else
@@ -148,38 +154,14 @@ function EpubDownloadBackend:getResponseAsString(url)
end
end
function EpubDownloadBackend:setTrapWidget(trap_widget)
--- Set a widget so subsequent HTTP requests are done under Trapper and
-- become interruptible by the user (see dismissed_error_code).
function EpubBuilder:setTrapWidget(trap_widget)
    self.trap_widget = trap_widget
end
function EpubDownloadBackend:resetTrapWidget()
--- Clear the trap widget, so HTTP requests are no longer interruptible.
function EpubBuilder:resetTrapWidget()
    self.trap_widget = nil
end
--- Fetch the content of url, under Trapper (dismissable by the user) when
-- a trap widget was previously set with setTrapWidget().
-- Returns the content string; raises on failure, and raises
-- self.dismissed_error_code when the user dismissed the request.
function EpubDownloadBackend:loadPage(url)
    local completed, success, content
    if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget()
        local Trapper = require("ui/trapper")
        local timeout, maxtime = 30, 60
        -- We use dismissableRunInSubprocess with complex return values:
        completed, success, content = Trapper:dismissableRunInSubprocess(function()
            return getUrlContent(url, timeout, maxtime)
        end, self.trap_widget)
        if not completed then
            error(self.dismissed_error_code) -- "Interrupted by user"
        end
    else
        local timeout, maxtime = 10, 60
        success, content = getUrlContent(url, timeout, maxtime)
    end
    -- BUGFIX: on failure, content can be false or a numeric error code,
    -- and calling :sub() on it would raise inside the logging call.
    logger.dbg("success:", success, "type(content):", type(content), "content:",
        type(content) == "string" and content:sub(1, 500) or content, "...")
    if not success then
        error(content)
    else
        return content
    end
end
local ext_to_mimetype = {
png = "image/png",
jpg = "image/jpeg",
@@ -195,29 +177,15 @@ local ext_to_mimetype = {
ttf = "application/truetype",
woff = "application/font-woff",
}
-- Create an epub file (with possibly images)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
-- Use Trapper to display progress and ask questions through the UI.
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
-- Trapper:info() and Trapper:confirm() will just use logger.
local UI = require("ui/trapper")
-- We may need to build absolute urls for non-absolute links and images urls
-- GetPublishableHtml
function EpubBuilder:getImagesAndHtml(html, url, include_images, filter_enable, filter_element)
local base_url = socket_url.parse(url)
local cancelled = false
local page_htmltitle = html:match([[<title>(.*)</title>]])
logger.dbg("page_htmltitle is ", page_htmltitle)
-- local sections = html.sections -- Wikipedia provided TOC
local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it changes if content is updated (as now, including the wikipedia revisionId),
-- or should it stays the same even if revid changes (content of the same book updated).
if filter_enable then html = filter(html, filter_element) end
local images = {}
local seen_images = {}
local imagenum = 1
local cover_imgid = nil -- best candidate for cover among our images
html = filter_enable and filter(html, filter_element) or html
local processImg = function(img_tag)
local src = img_tag:match([[src="([^"]*)"]])
if src == nil or src == "" then
@@ -272,13 +240,20 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
width = width,
height = height,
}
table.insert(images, cur_image)
seen_images[src] = cur_image
-- Use first image of reasonable size (not an icon) and portrait-like as cover-image
if not cover_imgid and width and width > 50 and height and height > 50 and height > width then
logger.dbg("Found a suitable cover image")
cover_imgid = imgid
cur_image["cover_image"] = true
end
table.insert(
images,
cur_image
)
imagenum = imagenum + 1
end
-- crengine will NOT use width and height attributes, but it will use
@@ -296,130 +271,53 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
local style = table.concat(style_props, "; ")
return string.format([[<img src="%s" style="%s" alt=""/>]], cur_image.imgpath, style)
end
html = html:gsub("(<%s*img [^>]*>)", processImg)
logger.dbg("Images found in html:", images)
-- See what to do with images
local use_img_2x = false
if not include_images then
if include_images then
html = html:gsub("(<%s*img [^>]*>)", processImg)
else
-- Remove img tags to avoid little blank squares of missing images
html = html:gsub("<%s*img [^>]*>", "")
-- We could remove the whole image container <div class="thumb"...> ,
-- but it's a lot of nested <div> and not easy to do.
-- So the user will see the image legends and know a bit about
-- the images he chose to not get.
-- the images they chose to not get.
end
UI:info(T(_("%1\n\nBuilding EPUB…"), message))
-- Open the zip file (with .tmp for now, as crengine may still
-- have a handle to the final epub_path, and we don't want to
-- delete a good one if we fail/cancel later)
local epub_path_tmp = epub_path .. ".tmp"
local ZipWriter = require("ffi/zipwriter")
local epub = ZipWriter:new{}
if not epub:open(epub_path_tmp) then
logger.dbg("Failed to open epub_path_tmp")
return false
end
-- Force a GC to free the memory we used (the second call may help
-- reclaim more memory).
collectgarbage()
collectgarbage()
return images, html
end
-- We now create and add all the required epub files
--- Set the EPUB title; it is later embedded by addToc() and addManifest().
function EpubBuilder:setTitle(title)
    self.title = title
end
-- ----------------------------------------------------------------
-- /mimetype : always "application/epub+zip"
epub:add("mimetype", "application/epub+zip")
-- ----------------------------------------------------------------
-- /META-INF/container.xml : always the same content
epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>]])
logger.dbg("Added META-INF/container.xml")
-- ----------------------------------------------------------------
-- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory)
-- Other possible items in this file that are of no interest to crengine :
-- In <manifest> :
-- <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
-- <item id="cover-image" href="images/cover.png" media-type="image/png"/>
-- (crengine only uses <meta name="cover" content="cover-image" /> to get the cover image)
-- In <spine toc="ncx"> :
-- <itemref idref="cover" linear="no"/>
-- And a <guide> section :
-- <guide>
-- <reference href="title.html" type="cover" title="Cover"/>
-- <reference href="toc.html" type="toc" title="Table of Contents" href="toc.html" />
-- </guide>
local content_opf_parts = {}
-- head
local meta_cover = "<!-- no cover image -->"
if include_images and cover_imgid then
meta_cover = string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
end
logger.dbg("meta_cover:", meta_cover)
table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
xmlns:dc="http://purl.org/dc/elements/1.1/"
unique-identifier="bookid" version="2.0">
<metadata>
<dc:title>%s</dc:title>
<dc:publisher>KOReader %s</dc:publisher>
%s
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="content" href="content.html" media-type="application/xhtml+xml"/>
<item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_htmltitle, Version:getCurrentRevision(), meta_cover))
-- images files
if include_images then
for inum, img in ipairs(images) do
table.insert(content_opf_parts, string.format([[ <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
end
end
-- tail
table.insert(content_opf_parts, [[
</manifest>
<spine toc="ncx">
<itemref idref="content"/>
</spine>
</package>
]])
epub:add("OEBPS/content.opf", table.concat(content_opf_parts))
logger.dbg("Added OEBPS/content.opf")
-- ----------------------------------------------------------------
-- OEBPS/stylesheet.css
--- @todo We told it we'd include a stylesheet.css, so it's probably best
-- that we do. In theory, we could try to fetch any *.css files linked in
-- the main html.
epub:add("OEBPS/stylesheet.css", [[
/* Empty */
]])
logger.dbg("Added OEBPS/stylesheet.css")
-- ----------------------------------------------------------------
-- OEBPS/toc.ncx : table of content
function EpubBuilder:addToc(chapters)
local toc_ncx_parts = {}
local depth = 0
local cur_level = 0
local np_end = [[</navPoint>]]
local num = 1
-- Add our own first section for first page, with page name as title
table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_htmltitle))
table.insert(toc_ncx_parts, np_end)
--- @todo Not essential for most articles, but longer articles might benefit
-- from parsing <h*> tags and constructing a proper TOC
while cur_level > 0 do
table.insert(toc_ncx_parts, np_end)
cur_level = cur_level - 1
local num = 0
for index, chapter in ipairs(chapters) do
-- Add nav part for each chapter.
table.insert(
toc_ncx_parts,
string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="%s.html"/></navPoint>]],
num,
num,
chapter.title,
chapter.md5
)
)
num = num + 1
end
-- Prepend NCX head
table.insert(toc_ncx_parts, 1, string.format([[
-- Prepend NCX head.
table.insert(
toc_ncx_parts,
1,
string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
@@ -433,99 +331,172 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
<text>%s</text>
</docTitle>
<navMap>
]], bookid, depth, page_htmltitle))
-- Append NCX tail
table.insert(toc_ncx_parts, [[
]],
"placeholder_bookid",
depth,
self.title
)
)
-- Append NCX tail.
table.insert(
toc_ncx_parts,
[[
</navMap>
</ncx>
]])
epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))
logger.dbg("Added OEBPS/toc.ncx")
-- ----------------------------------------------------------------
-- OEBPS/content.html
epub:add("OEBPS/content.html", html)
logger.dbg("Added OEBPS/content.html")
-- Force a GC to free the memory we used till now (the second call may
-- help reclaim more memory).
collectgarbage()
collectgarbage()
-- ----------------------------------------------------------------
-- OEBPS/images/*
if include_images then
local nb_images = #images
for inum, img in ipairs(images) do
-- Process can be interrupted at this point between each image download
-- by tapping while the InfoMessage is displayed
-- We use the fast_refresh option from image #2 for a quicker download
local go_on = UI:info(T(_("%1\n\nRetrieving image %2 / %3 …"), message, inum, nb_images), inum >= 2)
if not go_on then
logger.dbg("cancelled")
cancelled = true
break
end
local src = img.src
if use_img_2x and img.src2x then
src = img.src2x
end
logger.dbg("Getting img ", src)
local success, content = getUrlContent(src)
-- success, content = getUrlContent(src..".unexistant") -- to simulate failure
if success then
logger.dbg("success, size:", #content)
else
logger.dbg("failed fetching:", src)
end
if success then
-- Images do not need to be compressed, so spare some cpu cycles
local no_compression = true
if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text)
no_compression = false
end
epub:add("OEBPS/"..img.imgpath, content, no_compression)
logger.dbg("Adding OEBPS/"..img.imgpath)
else
go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
if not go_on then
cancelled = true
break
end
end
end
end
-- Done with adding files
if cancelled then
if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then
cancelled = false
end
end
if cancelled then
UI:info(_("Canceled. Cleaning up…"))
else
UI:info(T(_("%1\n\nPacking EPUB…"), message))
end
epub:close()
if cancelled then
-- Build was cancelled, remove half created .epub
if lfs.attributes(epub_path_tmp, "mode") == "file" then
os.remove(epub_path_tmp)
end
return false
end
-- Finally move the .tmp to the final file
os.rename(epub_path_tmp, epub_path)
logger.dbg("successfully created:", epub_path)
-- Force a GC to free the memory we used (the second call may help
-- reclaim more memory).
collectgarbage()
collectgarbage()
return true
]]
)
self.ncx_toc = toc_ncx_parts
end
return EpubDownloadBackend
--- Generate the OPF package document (OEBPS/content.opf) from the chapter
-- and image lists, and stage it in self.ncx_manifest for build().
-- Each image contributes a manifest <item>; an image flagged cover_image
-- becomes the <meta name="cover"> entry. Each chapter contributes a
-- manifest <item> plus a <spine> <itemref>, both keyed by chapter.md5.
function EpubBuilder:addManifest(chapters, images)
    local content_opf_parts = {}
    local spine_parts = {}
    local meta_cover = "<!-- no cover image -->"
    if #images > 0 then
        for inum, image in ipairs(images) do
            table.insert(
                content_opf_parts,
                string.format([[<item id="%s" href="%s" media-type="%s"/>%s]],
                    image.imgid,
                    image.imgpath,
                    image.mimetype,
                    "\n"
                )
            )
            -- See if the image has the tag we previously set indicating
            -- it can be used as a cover image.
            if image.cover_image then
                meta_cover = string.format([[<meta name="cover" content="%s"/>]], image.imgid)
            end
        end
    end
    if #chapters > 0 then
        for index, chapter in ipairs(chapters) do
            -- One manifest item and one spine entry per chapter.
            table.insert(
                content_opf_parts,
                string.format([[<item id="%s" href="%s.html" media-type="application/xhtml+xml"/>%s]],
                    chapter.md5,
                    chapter.md5,
                    "\n"
                )
            )
            table.insert(
                spine_parts,
                string.format([[<itemref idref="%s"/>%s]],
                    chapter.md5,
                    "\n"
                )
            )
        end
    end
    logger.dbg("meta_cover:", meta_cover)
    -- Prepend the OPF head: package metadata and the opening of <manifest>.
    table.insert(
        content_opf_parts,
        1,
        string.format([[<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
xmlns:dc="http://purl.org/dc/elements/1.1/"
unique-identifier="bookid" version="2.0">
<metadata>
<dc:title>%s</dc:title>
<dc:publisher>KOReader %s</dc:publisher>
%s
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
]], self.title, Version:getCurrentRevision(), meta_cover)
    )
    -- tail
    table.insert(
        content_opf_parts,
        string.format([[
</manifest>
<spine toc="ncx">
%s
</spine>
</package>
]], table.concat(spine_parts)
        )
    )
    self.ncx_manifest = content_opf_parts
end
--- Stage the per-chapter HTML files for build().
-- Each chapter yields an OEBPS file named "<chapter.md5>.html" containing
-- chapter.html; the list is stored in self.ncx_contents.
function EpubBuilder:addContents(chapters)
    local contents = {}
    for _, chapter in ipairs(chapters) do
        contents[#contents + 1] = {
            filename = chapter.md5 .. ".html",
            html = chapter.html,
        }
    end
    self.ncx_contents = contents
end
--- Download every image and stage it in self.ncx_images for build().
-- Images with no src, or whose download fails, are skipped.
function EpubBuilder:addImages(images)
    local images_table = {}
    for _, image in ipairs(images) do
        -- BUGFIX: the original `return`ed on the first src-less image,
        -- abandoning all remaining images and leaving ncx_images unset;
        -- skip just that image instead.
        if image.src then
            local src = image.src
            local success, content = NewsHelpers:getUrlContent(src)
            -- success, content = NewsHelpers:getUrlContent(src..".unexistant") -- to simulate failure
            if success then
                logger.dbg("EpubBuilder:addImages = success, size:", #content)
                -- Images do not need to be compressed, so spare some cpu cycles,
                -- except for SVG images (which are XML text).
                local no_compression = true
                if image.mimetype == "image/svg+xml" then
                    no_compression = false
                end
                table.insert(
                    images_table,
                    {
                        path = image.imgpath,
                        content = content,
                        -- BUGFIX: build() reads image.no_compression; the
                        -- original stored this under "compression", so every
                        -- image was needlessly compressed.
                        no_compression = no_compression,
                    }
                )
            else
                logger.dbg("EpubBuilder:addImages = failure fetching:", src)
            end
        end
    end
    self.ncx_images = images_table
end
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
--- Extract an href from a deserialized feed link.
-- A link may be a plain string, a table with an _attr.href, or a list
-- whose first element carries _attr.href; returns nil otherwise.
function EpubBuilder:getFeedLink(possible_link)
    local E = {}
    logger.dbg("Possible link", possible_link)
    if type(possible_link) == "string" then
        return possible_link
    end
    local direct_attr = possible_link._attr or E
    if direct_attr.href then
        return direct_attr.href
    end
    if ((possible_link[1] or E)._attr or E).href then
        return possible_link[1]._attr.href
    end
end
return EpubBuilder

View File

@@ -0,0 +1,398 @@
local BD = require("ui/bidi")
local DownloadBackend = require("epubdownloadbackend")
local NewsHelpers = require("http_utilities")
local dateparser = require("lib.dateparser")
local logger = require("logger")
local md5 = require("ffi/sha2").md5
local util = require("util")
local _ = require("gettext")
local N_ = _.ngettext
local FFIUtil = require("ffi/util")
local T = FFIUtil.template
local FeedSource = {
file_extension = ".epub"
}
--- Construct a new FeedSource instance (prototype pattern).
-- @tparam[opt] table o seed table to turn into an instance
-- @treturn table the new instance
function FeedSource:new(o)
    local instance = o or {}
    self.__index = self
    return setmetatable(instance, self)
end
--- Fetch and initialize every feed in feed_list.
-- progress_callback(msg) receives per-feed UI updates; error_callback(msg)
-- is called once at the end if any feed failed to initialize.
-- Returns a list of { config = feed, document = initialized_document }.
function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback)
    local initialized_feeds = {}
    local unsupported_feeds_urls = {}
    for idx, feed in ipairs(feed_list) do
        local url = feed[1]
        -- Show a UI update.
        -- BUGFIX: the "%1 of %2" template was given the url as %2;
        -- pass the total feed count instead.
        progress_callback(T(
            _("Setting up feed %1 of %2."),
            idx,
            #feed_list
        ))
        -- Initialize the feed.
        local ok, response = pcall(function()
            return self:initializeDocument(
                self:fetchDocumentByUrl(url)
            )
        end)
        -- If the initialization worked, add the feed
        -- to the list of initialized feeds.
        if ok and response then
            table.insert(initialized_feeds, {
                config = feed,
                document = response,
            })
        else
            table.insert(unsupported_feeds_urls, url .. ": " .. tostring(response))
        end
    end
    if #unsupported_feeds_urls > 0 then
        -- BUGFIX: the original overwrote the message on every iteration,
        -- so only the last error was ever shown; join them all.
        local unsupported_urls = table.concat(unsupported_feeds_urls, "\n\n")
        error_callback(
            T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #unsupported_feeds_urls),
                #unsupported_feeds_urls, unsupported_urls)
        )
    end
    return initialized_feeds
end
-- This function contacts the feed website and attempts to get
-- the RSS/Atom document with a list of the latest items.
--- Contact the feed website and return the deserialized RSS/Atom document
-- listing the latest items.
-- Raises "(Reason: ...)" errors on download or deserialization failure.
function FeedSource:fetchDocumentByUrl(url)
    local document
    -- Get the XML document representing the feed
    local ok, response = pcall(function()
        local success, content = NewsHelpers:getUrlContent(url)
        if (success) then
            return content
        else
            error("Failed to download content for url: " .. url, 0)
        end
    end)
    -- Check to see if a response is available to deserialize.
    if ok then
        -- Deserialize the XML document into something Lua can use
        document = NewsHelpers:deserializeXMLString(response)
    end
    if not ok then
        error("(Reason: Failed to download feed document)", 0)
    elseif not document then
        -- BUGFIX: the original `if ok or document` returned nil here,
        -- silently hiding deserialization failures and making this
        -- branch unreachable.
        error("(Reason: Error during feed document deserialization)", 0)
    end
    return document
end
-- Supply this method with the XML document returned by the feed,
-- and it will initialized the document by extracting the feed title,
-- feed items, and items count.
--- Initialize a deserialized feed document: extract the feed title, item
-- list and item count, and attach them as document.title, document.items
-- and document.total_items. Raises if the document is neither RSS nor Atom.
function FeedSource:initializeDocument(document)
    local feed_title
    local feed_items
    local total_items
    -- getFeedType raises for unrecognized documents; pcall turns that into
    -- the boolean checked below (the error detail is discarded).
    local ok = pcall(function()
        return self:getFeedType(
            document,
            function()
                -- RSS callback
                feed_title = util.htmlEntitiesToUtf8(document.rss.channel.title)
                feed_items = document.rss.channel.item
                total_items = #document.rss.channel.item
            end,
            function()
                -- Atom callback
                feed_title = FeedSource:getFeedTitle(document.feed.title)
                feed_items = document.feed.entry
                total_items = #document.feed.entry
            end
        )
    end)
    if ok then
        document.title = feed_title
        document.items = feed_items
        document.total_items = total_items
        return document
    else
        error(_("Could not initialize feed document"), 0)
    end
end
--- Download and initialize the HTML content for each item in a feed.
-- @param feed initialized feed table ({ config = ..., document = ... })
-- @param progress_callback called with a UI status string per item
-- @param error_callback called when an item's content can't be fetched
-- @return list of initialized item tables, or nil when nothing succeeded
function FeedSource:getItemsContent(feed, progress_callback, error_callback)
    -- BUGFIX: tonumber() returns nil when no limit is configured; treat
    -- that as 0, i.e. "download all items".
    local limit = tonumber(feed.config.limit) or 0
    local total_items = (limit == 0) and
        feed.document.total_items or
        limit
    local initialized_feed_items = {}
    -- Download each item in the feed.
    for index, item in pairs(feed.document.items) do
        -- If limit has been met, stop downloading feed.
        if limit ~= 0 and index - 1 == limit then
            break
        end
        -- Display feedback to user.
        progress_callback(T(
            _("%3\n Downloading item %1 of %2"),
            index,
            total_items,
            feed.document.title
        ))
        -- Download the article's HTML.
        local ok, response = pcall(function()
            return self:initializeItemHtml(
                feed,
                self:getItemHtml(
                    item,
                    feed.config.download_full_article
                )
            )
        end)
        -- Add the result to our table, or send a
        -- result to the error callback.
        if ok then
            table.insert(initialized_feed_items, {
                html = response.html,
                images = response.images,
                item_slug = FeedSource:getItemTitleWithDate(item),
                item_title = item.title,
                md5 = md5(item.title),
                feed_title = feed.document.title,
            })
        else
            error_callback(
                T(_("Could not get content for: %1"), feed.document.title)
            )
        end
    end
    if #initialized_feed_items > 0 then
        return initialized_feed_items
    else
        return nil
    end
end
--- Run a downloaded item's HTML through the EPUB backend's image and
-- filter processing, using the feed's configuration.
-- Returns { html = processed_html, images = image_list }.
function FeedSource:initializeItemHtml(feed, html)
    -- feed.config[1] holds the feed URL (used to absolutize relative links).
    local url = feed.config[1]
    -- local download_full_article = feed.config.download_full_article ~= false
    local include_images = feed.config.include_images ~= false
    -- NOTE(review): this evaluates to the configured element, or boolean
    -- `true` when filter_element is absent — a boolean looks unintended for
    -- a CSS selector; confirm against the filter() consumer.
    local filter_element = feed.config.filter_element or
        feed.config.filter_element == nil
    local enable_filter = feed.config.enable_filter ~= false
    local item_images, item_html = DownloadBackend:getImagesAndHtml(
        html,
        url,
        include_images,
        enable_filter,
        filter_element
    )
    return {
        html = item_html,
        images = item_images
    }
end
--- Detect whether the deserialized document is RSS or Atom and dispatch to
-- the matching callback (returning its result). Raises when it is neither.
function FeedSource:getFeedType(document, rss_cb, atom_cb)
    -- Check to see if the feed uses RSS.
    local is_rss = document.rss and
        document.rss.channel and
        document.rss.channel.title and
        document.rss.channel.item and
        document.rss.channel.item[1] and
        document.rss.channel.item[1].title and
        document.rss.channel.item[1].link
    -- Check to see if the feed uses Atom.
    -- BUGFIX: guard document.feed.entry itself; the original indexed
    -- entry[1] and raised on Atom-ish documents with no entries.
    local is_atom = document.feed and
        document.feed.title and
        document.feed.entry and
        document.feed.entry[1] and
        document.feed.entry[1].title and
        document.feed.entry[1].link
    -- Dispatch based on the detected feed type.
    if is_atom then
        return atom_cb()
    elseif is_rss then
        return rss_cb()
    end
    -- Neither format matched. (The original's unreachable selection logic
    -- always yielded the RSS message here; that preference is preserved.)
    if not is_rss then
        error(_("(Reason: Couldn't process RSS)"))
    else
        error(_("(Reason: Couldn't process Atom)"))
    end
end
--- Return the HTML for a feed item: the full article fetched from the
-- item's link when download_full_article is set, otherwise a small page
-- built from the item's description/summary.
function FeedSource:getItemHtml(item, download_full_article)
    if download_full_article then
        return NewsHelpers:loadPage(
            FeedSource:getFeedLink(item.link)
        )
    end
    local feed_description = item.description or item.summary
    local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
    return string.format([[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]], item.title, item.title, feed_description, footer)
end
-- @todo: move this elsewhere
--- Build the absolute output path for an EPUB, creating the feed's
-- subdirectory under download_dir if it doesn't exist yet.
function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title)
    local safe_sub_dir = util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir))
    local feed_output_dir = ("%s%s/"):format(download_dir, safe_sub_dir)
    -- Create the output directory if it doesn't exist.
    if not lfs.attributes(feed_output_dir, "mode") then
        lfs.mkdir(feed_output_dir)
    end
    return ("%s%s%s"):format(
        feed_output_dir,
        FeedSource:getFeedTitle(epub_title),
        self.file_extension
    )
end
--- Build an EPUB titled `title` from `chapters` at abs_output_path.
-- Skips (and returns true) when the file already exists; raises when
-- chapters is empty. Returns true on success, false on failure (reporting
-- the build error via error_callback).
function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback)
    local file_exists = lfs.attributes(abs_output_path, "mode")
    if file_exists then
        logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path)
        return true
    end
    if #chapters == 0 then
        error(_("Error: chapters contains 0 items"), 0)
    end
    -- Flatten the per-chapter image lists into one list.
    local images = {}
    for _, chapter in ipairs(chapters) do
        for _, image in ipairs(chapter.images) do
            table.insert(images, image)
        end
    end
    local epub = DownloadBackend:new{}
    progress_callback(T(_("Building EPUB %1"), title))
    epub:setTitle(title)
    epub:addToc(chapters)
    epub:addManifest(chapters, images)
    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents")))
    epub:addContents(chapters)
    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images")))
    epub:addImages(images)
    progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk")))
    local ok, err = pcall(function()
        return epub:build(abs_output_path)
    end)
    if ok and lfs.attributes(abs_output_path, "mode") then
        return true
    end
    -- BUGFIX: the original swallowed the pcall error and never used its
    -- error_callback parameter; surface the failure to the caller's UI.
    if not ok and error_callback then
        error_callback(T(_("Error building EPUB %1: %2"), title, tostring(err)))
    end
    return false
end
--- Parse a feed timestamp and format it as a filename prefix.
-- Uses lua-feedparser https://github.com/slact/lua-feedparser
-- feedparser is available under the (new) BSD license.
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
local function parseDate(dateTime)
    local timestamp = dateparser.parse(dateTime)
    return os.date("%y-%m-%d_%H-%M_", timestamp)
end
--- Prefix the sanitized feed title with the current date and time.
function FeedSource:getFeedTitleWithDate(feed)
    local safe_title = util.getSafeFilename(FeedSource:getFeedTitle(feed.document.title))
    return os.date("%y-%m-%d_%H-%M_") .. safe_title
end
-- Creates a title with date from a feed item.
--- Create a title with date from a feed item: the sanitized item title,
-- prefixed with its publication date when a known date field is present
-- (checked in order: updated, pubDate, published).
function FeedSource:getItemTitleWithDate(item)
    local title = util.getSafeFilename(FeedSource:getFeedTitle(item.title))
    local date_string = item.updated or item.pubDate or item.published
    if date_string then
        return parseDate(date_string) .. title
    end
    return title
end
-- Extract a displayable title from a deserialized <title> element.
-- A plain <title>blabla</title> deserializes to a string; a title with
-- attributes (<title attr="alb">blabla</title>) deserializes to a table
-- where [1] is the title text and the attributes are also available.
-- Returns nil when no string title can be found.
function FeedSource:getFeedTitle(possible_title)
    if type(possible_title) == "string" then
        return util.htmlEntitiesToUtf8(possible_title)
    end
    local first_entry = possible_title[1]
    if type(first_entry) == "string" then
        return util.htmlEntitiesToUtf8(first_entry)
    end
end
-- Pick a link (URL string) out of a deserialized feed entry.
-- There can be multiple links; for now the first one is assumed to be
-- the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit tests:
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
function FeedSource:getFeedLink(possible_link)
    -- A bare <link>url</link> deserializes straight to a string.
    if type(possible_link) == "string" then
        return possible_link
    end
    -- <link href="..."/> deserializes to { _attr = { href = ... } };
    -- a list of links nests that one level deeper. E guards nil lookups.
    local E = {}
    local direct_href = (possible_link._attr or E).href
    if direct_href then
        return direct_href
    end
    local nested_href = ((possible_link[1] or E)._attr or E).href
    if nested_href then
        return nested_href
    end
end
return FeedSource

View File

@@ -7,7 +7,10 @@ local FeedView = {
DOWNLOAD_FULL_ARTICLE = "download_full_article",
INCLUDE_IMAGES = "include_images",
ENABLE_FILTER = "enable_filter",
FILTER_ELEMENT = "filter_element"
FILTER_ELEMENT = "filter_element",
VOLUMIZE = "volumize",
ACTION_RESET_HISTORY = "reset_history",
ACTION_DELETE_FEED = "delete_feed",
}
function FeedView:getList(feed_config, callback, edit_feed_attribute_callback, delete_feed_callback)
@@ -49,7 +52,7 @@ function FeedView:getList(feed_config, callback, edit_feed_attribute_callback, d
return view_content
end
function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
function FeedView:getItem(id, feed, edit_feed_callback, feed_action_callback)
logger.dbg("NewsDownloader:", feed)
@@ -67,6 +70,7 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
local include_images = feed.include_images ~= false
local enable_filter = feed.enable_filter ~= false
local filter_element = feed.filter_element
local volumize = feed.volumize ~= false
local vc = {
{
@@ -136,11 +140,22 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
)
end
},
{
_("Volumize feed"),
volumize,
callback = function()
edit_feed_callback(
id,
FeedView.VOLUMIZE,
volumize
)
end
},
}
-- We don't always display this. For instance: if a feed
-- is being created, this button is not necessary.
if delete_feed_callback then
-- These actions only pertain to initiated feeds, so we don't always
-- display them.
if feed_action_callback then
table.insert(
vc,
"---"
@@ -151,8 +166,22 @@ function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
_("Delete feed"),
"",
callback = function()
delete_feed_callback(
id
feed_action_callback(
id,
FeedView.ACTION_DELETE_FEED
)
end
}
)
table.insert(
vc,
{
_("Reset feed history"),
"",
callback = function()
feed_action_callback(
url,
FeedView.ACTION_RESET_HISTORY
)
end
}

View File

@@ -0,0 +1,126 @@
local logger = require("logger")
local http = require("socket.http")
local socketutil = require("socketutil")
local socket_url = require("socket.url")
local socket = require("socket")
local ltn12 = require("ltn12")
-- Namespace table holding the HTTP helper functions exported by this module.
local NewsHelpers = {
}
-- Maximum number of HTTP redirects getUrlContent will follow.
local max_redirects = 5; --prevent infinite redirects
-- Get URL content.
-- Fetches `url` with an HTTP(S) GET, transparently following up to
-- max_redirects redirects.
--
-- @param url absolute URL to fetch
-- @param timeout per-operation socket timeout in seconds (default 10)
-- @param maxtime overall time budget in seconds (default 30); when given,
--        a budget-enforcing sink is used instead of a plain table sink
-- @param redirectCount internal recursion counter; callers omit it
-- @return true, content on success
-- @return false, error_message_or_code on failure
-- Raises an error when the redirect limit is reached.
function NewsHelpers:getUrlContent(url, timeout, maxtime, redirectCount)
    logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")")
    if not redirectCount then
        redirectCount = 0
    elseif redirectCount == max_redirects then
        -- NOTE: error() takes (message, level); the count must be part of
        -- the message, not passed as the (numeric) level argument.
        error("EpubDownloadBackend: reached max redirects: " .. tostring(redirectCount))
    end
    if not timeout then timeout = 10 end
    logger.dbg("timeout:", timeout)
    local sink = {}
    local parsed = socket_url.parse(url)
    socketutil:set_timeout(timeout, maxtime or 30)
    local request = {
        url = url,
        method = "GET",
        -- table_sink enforces the overall maxtime budget while accumulating
        sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink),
    }
    logger.dbg("request:", request)
    local code, headers, status = socket.skip(1, http.request(request))
    socketutil:reset_timeout()
    logger.dbg("After http.request")
    local content = table.concat(sink) -- empty or content accumulated till now
    logger.dbg("type(code):", type(code))
    logger.dbg("code:", code)
    logger.dbg("headers:", headers)
    logger.dbg("status:", status)
    logger.dbg("#content:", #content)
    if code == socketutil.TIMEOUT_CODE or
       code == socketutil.SSL_HANDSHAKE_CODE or
       code == socketutil.SINK_TIMEOUT_CODE
    then
        logger.warn("request interrupted:", code)
        return false, code
    end
    if headers == nil then
        -- http.request() failed before getting a response (DNS, refused, ...)
        logger.warn("No HTTP headers:", code, status)
        return false, "Network or remote server unavailable"
    end
    if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
        if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302...
            local redirected_url = headers.location
            local parsed_redirect_location = socket_url.parse(redirected_url)
            if not parsed_redirect_location.host then
                -- Relative redirect: resolve against the original URL.
                parsed_redirect_location.host = parsed.host
                parsed_redirect_location.scheme = parsed.scheme
                redirected_url = socket_url.build(parsed_redirect_location)
            end
            logger.dbg("getUrlContent: Redirecting to url: ", redirected_url)
            return self:getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1)
        else
            logger.warn("HTTP status not okay:", code, status)
            return false, status
        end
    end
    if headers and headers["content-length"] then
        -- Check we really got the announced content size
        local content_length = tonumber(headers["content-length"])
        if #content ~= content_length then
            return false, "Incomplete content received"
        end
    end
    logger.dbg("Returning content ok")
    return true, content
end
-- Fetch `url` and return its body as a string.
-- Raises an error carrying the failure reason (message or numeric code)
-- when the fetch fails, so callers can trap it with pcall.
function NewsHelpers:loadPage(url)
    logger.dbg("Load page: ", url)
    local success, content
    --[[ if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget()
        local Trapper = require("ui/trapper")
        local timeout, maxtime = 30, 60
        -- We use dismissableRunInSubprocess with complex return values:
        completed, success, content = Trapper:dismissableRunInSubprocess(function()
            return NewsHelpers:getUrlContent(url, timeout, maxtime)
        end, self.trap_widget)
        if not completed then
            error(self.dismissed_error_code) -- "Interrupted by user"
        end
    else]]--
    local timeout, maxtime = 10, 60
    success, content = NewsHelpers:getUrlContent(url, timeout, maxtime)
    -- end
    -- On failure, `content` may be a numeric error code (e.g. a socketutil
    -- timeout code) rather than a string, so stringify it before slicing a
    -- preview for the log — content:sub() would crash on a number.
    logger.dbg("success:", success, "type(content):", type(content), "content:", tostring(content):sub(1, 500), "...")
    if not success then
        error(content)
    else
        return content
    end
end
-- Parse an XML string into a nested Lua table.
-- Uses LuaXML (https://github.com/manoelcampos/LuaXML),
-- The MIT License (MIT), Copyright (c) 2016 Manoel Campos da Silva Filho;
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENSE_LuaXML
-- Returns nil when the input is not well-formed XML.
function NewsHelpers:deserializeXMLString(xml_str)
    local treehdl = require("lib/handler")
    local libxml = require("lib/xml")
    -- Handler that accumulates the parsed document into a simple tree table.
    local tree_handler = treehdl.simpleTreeHandler()
    -- The parser throws on malformed input, so trap it with pcall.
    local parsed_ok = pcall(function()
        libxml.xmlParser(tree_handler):parse(xml_str)
    end)
    if not parsed_ok then
        return
    end
    return tree_handler.root
end
return NewsHelpers

File diff suppressed because it is too large Load Diff