diff --git a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua index cbc9dce3a..2bde05fe7 100644 --- a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua +++ b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua @@ -24,6 +24,54 @@ local max_redirects = 5; --prevent infinite redirects local TIMEOUT_CODE = "timeout" -- from socket.lua local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime +-- filter HTML using CSS selector +local function filter(text, element) + local htmlparser = require("htmlparser") + local root = htmlparser.parse(text, 5000) + local filtered = nil + local selectors = { + "main", + "article", + "div#main", + "#main-article", + ".main-content", + "#body", + "#content", + ".content", + "div#article", + "div.article", + "div.post", + "div.post-outer", + ".l-root", + ".content-container", + ".StandardArticleBody_body", + "div#article-inner", + "div#newsstorytext", + "div.general", + } + if element then + table.insert(selectors, 1, element) + end + for _, sel in ipairs(selectors) do + local elements = root:select(sel) + if elements then + for _, e in ipairs(elements) do + filtered = e:getcontent() + if filtered then + break + end + end + if filtered then + break + end + end + end + if not filtered then + return text + end + return "<html><body>" .. filtered .. 
"</body></html>" +end + -- Sink that stores into a table, aborting if maxtime has elapsed local function sink_table_with_maxtime(t, maxtime) -- Start counting as soon as this sink is created @@ -181,15 +229,13 @@ local ext_to_mimetype = { ttf = "application/truetype", woff = "application/font-woff", } - -- Create an epub file (with possibly images) -function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message) +function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element) logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")") -- Use Trapper to display progress and ask questions through the UI. -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise -- Trapper:info() and Trapper:confirm() will just use logger. local UI = require("ui/trapper") - -- We may need to build absolute urls for non-absolute links and images urls local base_url = socket_url.parse(url)
- + if filter_enable then html = filter(html, filter_element) end local images = {} local seen_images = {} local imagenum = 1 diff --git a/plugins/newsdownloader.koplugin/feed_config.lua b/plugins/newsdownloader.koplugin/feed_config.lua index 1c17ced8f..9e864acfb 100644 --- a/plugins/newsdownloader.koplugin/feed_config.lua +++ b/plugins/newsdownloader.koplugin/feed_config.lua @@ -21,12 +21,19 @@ return {--do NOT change this line -- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes) -- default value is 'false' (if no 'include_images' entry) + -- 'enable_filter=true' - means filter using a CSS selector to delimit part of the page to just that (does not apply if download_full_article=false) + -- 'enable_filter=false' - means no such filtering and including the full page + -- default value is 'false' + + -- 'filter_element="name_of_css.element.class"' - means to filter the chosen CSS selector; it can be easily picked using a modern web browser 
+ -- The default value is empty. The built-in list of common selectors is always used as a fallback.
+ -- comment out line ("--" at line start) to stop downloading source -- LIST YOUR FEEDS HERE: - { "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true}, + { "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true, include_images=true, enable_filter=true}, { "https://www.pcworld.com/index.rss", limit = 7 , download_full_article=false}, diff --git a/plugins/newsdownloader.koplugin/main.lua b/plugins/newsdownloader.koplugin/main.lua index ead943698..a5d3ec472 100644 --- a/plugins/newsdownloader.koplugin/main.lua +++ b/plugins/newsdownloader.koplugin/main.lua @@ -199,10 +199,12 @@ function NewsDownloader:loadConfigAndProcessFeeds() local limit = feed.limit local download_full_article = feed.download_full_article == nil or feed.download_full_article local include_images = not never_download_images and feed.include_images + local enable_filter = feed.enable_filter or false + local filter_element = feed.filter_element if url and limit then local feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url)) UI:info(feed_message) - NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message) + NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message, enable_filter, filter_element) else logger.warn('NewsDownloader: invalid feed config entry', feed) end @@ -230,7 +232,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI() end) end -function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message) +function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element) local ok, response = pcall(function() return
DownloadBackend:getResponseAsString(url) @@ -250,11 +252,11 @@ function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, do if is_atom then ok = pcall(function() - return self:processAtom(feeds, limit, download_full_article, include_images, message) + return self:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element) end) elseif is_rss then ok = pcall(function() - return self:processRSS(feeds, limit, download_full_article, include_images, message) + return self:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element) end) end if not ok or (not is_rss and not is_atom) then @@ -280,7 +282,7 @@ function NewsDownloader:deserializeXMLString(xml_str) return xmlhandler.root end -function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message) +function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element) local feed_output_dir = string.format("%s%s/", news_download_dir_path, util.getSafeFilename(getFeedTitle(feeds.feed.title))) @@ -294,14 +296,14 @@ function NewsDownloader:processAtom(feeds, limit, download_full_article, include end local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit) if download_full_article then - self:downloadFeed(feed, feed_output_dir, include_images, article_message) + self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element) else self:createFromDescription(feed, feed.content[1], feed_output_dir, include_images, article_message) end end end -function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message) +function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element) local feed_output_dir = ("%s%s/"):format( 
news_download_dir_path, util.getSafeFilename(util.htmlEntitiesToUtf8(feeds.rss.channel.title))) if not lfs.attributes(feed_output_dir, "mode") then @@ -314,7 +316,7 @@ function NewsDownloader:processRSS(feeds, limit, download_full_article, include_ end local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit) if download_full_article then - self:downloadFeed(feed, feed_output_dir, include_images, article_message) + self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element) else self:createFromDescription(feed, feed.description, feed_output_dir, include_images, article_message) end @@ -341,7 +343,7 @@ local function getTitleWithDate(feed) return title end -function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message) +function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element) local title_with_date = getTitleWithDate(feed) local news_file_path = ("%s%s%s"):format(feed_output_dir, title_with_date, @@ -355,7 +357,7 @@ function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, mess local article_message = T(_("%1\n%2"), message, title_with_date) local link = getFeedLink(feed.link) local html = DownloadBackend:loadPage(link) - DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message) + DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element) end end