mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
OPDSParser: Attempt to preserve data from content tags *without* breaking luxl (#7768)
Tackle the content blocks issue differently, in order to preserve the data, which is now useful since #7767
This commit is contained in:
@@ -72,10 +72,6 @@ function OPDSParser:createFlatXTable(xlex, curr_element)
|
||||
end
|
||||
|
||||
function OPDSParser:parse(text)
|
||||
-- Murder Calibre's whole "content" block, because luxl doesn't really deal well with various XHTML quirks,
|
||||
-- as the list of crappy replacements below attests to...
|
||||
-- There's also a high probability of finding orphaned tags or badly nested ones in there, which will screw everything up.
|
||||
text = text:gsub('<content type="xhtml">.-</content>', '')
|
||||
-- luxl doesn't handle XML comments, so strip them
|
||||
text = text:gsub("<!%-%-.-%-%->", "")
|
||||
-- luxl is also particular about the syntax for self-closing, empty & orphaned tags...
|
||||
@@ -84,8 +80,18 @@ function OPDSParser:parse(text)
|
||||
text = text:gsub("<([bh]r)>", "<%1 />")
|
||||
-- Some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems
|
||||
text = text:gsub("<!%[CDATA%[(.-)%]%]>", function (s)
|
||||
return s:gsub( "%p", {["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;" } )
|
||||
return s:gsub("%p", {["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;"})
|
||||
end )
|
||||
|
||||
-- NOTE: OPDS content tags are likely to contain a bunch of HTML or XHTML. We do *NOT* want to let luxl parse that,
|
||||
-- because it doesn't really deal well with various XHTML quirks, as the list of crappy replacements above attests to...
|
||||
-- There's also a high probability of finding orphaned tags or badly nested ones in there, which would screw everything up.
|
||||
-- In any case, we just want to treat the whole thing as a single text node anyway, so, just mangle the markup to force luxl's hand.
|
||||
text = text:gsub('<content type=".-">', "<content>")
|
||||
text = text:gsub("<content>(.-)</content>", function (s)
|
||||
return '<content type="text">' .. s:gsub("%p", {["<"] = "&lt;", [">"] = "&gt;", ['"'] = "&quot;", ["'"] = "&apos;"}) .. "</content>"
|
||||
end )
|
||||
|
||||
local xlex = luxl.new(text, #text)
|
||||
return assert(self:createFlatXTable(xlex))
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user