From 56c08020d7080b974f41ea976f1e494489f175ef Mon Sep 17 00:00:00 2001
From: Frans de Jonge <fransdejonge@gmail.com>
Date: Sun, 2 Feb 2025 13:34:38 +0100
Subject: [PATCH] NewsDownloader: process HTML with cre.getBalancedHTML() to
 ensure self-closing tags like <hr> are closed like <hr/> (#13188)

Works around the issue in <https://github.com/koreader/koreader/issues/13173#issuecomment-2628027654>.
---
 plugins/newsdownloader.koplugin/epubdownloadbackend.lua | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua
index 85d60c637..81576eb19 100644
--- a/plugins/newsdownloader.koplugin/epubdownloadbackend.lua
+++ b/plugins/newsdownloader.koplugin/epubdownloadbackend.lua
@@ -328,6 +328,14 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
     local cancelled = false
     local page_htmltitle = html:match([[<title[^>]*>(.-)</title>]])
     logger.dbg("page_htmltitle is ", page_htmltitle)
+
+    -- Rejigger HTML into XHTML to avoid unclosed elements. See <https://github.com/koreader/crengine/pull/370#issuecomment-910156921>.
+    local cre = require("libs/libkoreader-cre")
+    html = cre.getBalancedHTML(html, 0x0)
+
+    -- Remove all <script> elements (tags and their contents) to save a few bytes.
+    html = html:gsub("<script.->.-</script>", "")
+
 --    local sections = html.sections -- Wikipedia provided TOC
     local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
     -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,