From 56c08020d7080b974f41ea976f1e494489f175ef Mon Sep 17 00:00:00 2001 From: Frans de Jonge Date: Sun, 2 Feb 2025 13:34:38 +0100 Subject: [PATCH] NewsDownloader: process HTML with cre.getBalancedHTML() to ensure self-closing tags like
<br> are closed like <br/>
(#13188) Works around the issue in . --- plugins/newsdownloader.koplugin/epubdownloadbackend.lua | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua index 85d60c637..81576eb19 100644 --- a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua +++ b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua @@ -328,6 +328,14 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me local cancelled = false local page_htmltitle = html:match([[]*>(.-)]]) logger.dbg("page_htmltitle is ", page_htmltitle) + + -- Rejigger HTML into XHTML to avoid unclosed elements. See . + local cre = require("libs/libkoreader-cre") + html = cre.getBalancedHTML(html, 0x0) + + -- Remove all script tags to save a few bytes. + html = html:gsub(".-", "") + -- local sections = html.sections -- Wikipedia provided TOC local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid) -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,