NewsDownloader: process HTML with cre.getBalancedHTML() to ensure void elements like <hr> are written in self-closed form, e.g. <hr/> (#13188)
Some checks are pending
macos / macOS ${{ matrix.image }} ${{ matrix.platform }} 🔨${{ matrix.xcode_version }} 🎯${{ matrix.deployment_target }} (10.15, 13, x86-64, 15.2) (push) Waiting to run
macos / macOS ${{ matrix.image }} ${{ matrix.platform }} 🔨${{ matrix.xcode_version }} 🎯${{ matrix.deployment_target }} (11.0, 14, ARM64, 15.4) (push) Waiting to run

Works around the issue in <https://github.com/koreader/koreader/issues/13173#issuecomment-2628027654>.
This commit is contained in:
Frans de Jonge
2025-02-02 13:34:38 +01:00
committed by GitHub
parent f516d21913
commit 56c08020d7

View File

@@ -328,6 +328,14 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
local cancelled = false
local page_htmltitle = html:match([[<title[^>]*>(.-)</title>]])
logger.dbg("page_htmltitle is ", page_htmltitle)
-- Convert the HTML into balanced XHTML so that no elements are left unclosed. See <https://github.com/koreader/crengine/pull/370#issuecomment-910156921>.
local cre = require("libs/libkoreader-cre")
html = cre.getBalancedHTML(html, 0x0)
-- Remove all script tags to save a few bytes.
html = html:gsub("<script.->.-</script>", "")
-- local sections = html.sections -- Wikipedia provided TOC
local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,