mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
OPDS*: Mangle Calibre feeds some more so that they don't confuse luxl/us (#6902)
This is done by essentially dropping the whole XHTML block, instead of trying to salvage each and every tag one by one as we did before. Also, since nil URLs are usually the result of broken parsing, handle them slightly better in the frontend, so that they get caught and reported properly instead of doing nothing and/or crashing half the time.
This commit is contained in:
@@ -33,13 +33,12 @@ end
|
||||
function OPDSParser:createFlatXTable(xlex, curr_element)
|
||||
curr_element = curr_element or {}
|
||||
|
||||
local curr_attr_name;
|
||||
local attr_count = 0;
|
||||
local curr_attr_name
|
||||
local attr_count = 0
|
||||
|
||||
-- start reading the thing
|
||||
local txt
|
||||
for event, offset, size in xlex:Lexemes() do
|
||||
txt = ffi.string(xlex.buf + offset, size)
|
||||
local txt = ffi.string(xlex.buf + offset, size)
|
||||
if event == luxl.EVENT_START then
|
||||
if txt ~= "xml" then
|
||||
-- does current element already have something
|
||||
@@ -61,7 +60,7 @@ function OPDSParser:createFlatXTable(xlex, curr_element)
|
||||
curr_attr_name = unescape(txt)
|
||||
elseif event == luxl.EVENT_ATTR_VAL then
|
||||
curr_element[curr_attr_name] = unescape(txt)
|
||||
attr_count = attr_count + 1;
|
||||
attr_count = attr_count + 1
|
||||
curr_attr_name = nil
|
||||
elseif event == luxl.EVENT_TEXT then
|
||||
curr_element = unescape(txt)
|
||||
@@ -73,16 +72,23 @@ function OPDSParser:createFlatXTable(xlex, curr_element)
|
||||
end
|
||||
|
||||
function OPDSParser:parse(text)
|
||||
-- luxl cannot properly handle xml comments and we need first remove them
|
||||
text = text:gsub("<!--.--->", "")
|
||||
-- luxl prefers <br />, other two forms are valid in HTML,
|
||||
-- but will kick the ass of luxl
|
||||
-- Murder Calibre's whole "content" block, because luxl doesn't really deal well with various XHTML quirks,
|
||||
-- as the list of crappy replacements below attests to...
|
||||
-- There's also a high probability of finding orphaned tags or badly nested ones in there, which will screw everything up.
|
||||
text = text:gsub('<content type="xhtml">.-</content>', '')
|
||||
-- luxl doesn't handle XML comments, so strip them
|
||||
text = text:gsub("<!%-%-.-%-%->", "")
|
||||
-- luxl prefers <br />, the other two forms are valid in HTML, but will kick luxl's ass
|
||||
text = text:gsub("<br>", "<br />")
|
||||
text = text:gsub("<br/>", "<br />")
|
||||
-- Same deal with hr
|
||||
text = text:gsub("<hr>", "<hr />")
|
||||
text = text:gsub("<hr/>", "<hr />")
|
||||
-- some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems
|
||||
-- It's also allergic to orphaned <em/> (As opposed to a balanced <em></em> pair)...
|
||||
text = text:gsub("<em/>", "")
|
||||
-- Let's assume it might also happen to strong...
|
||||
text = text:gsub("<strong/>", "")
|
||||
-- Some OPDS catalogs wrap text in a CDATA section, remove it as it causes parsing problems
|
||||
text = text:gsub("<!%[CDATA%[(.-)%]%]>", function (s)
|
||||
return s:gsub( "%p", {["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;" } )
|
||||
end )
|
||||
|
||||
@@ -78,7 +78,7 @@ function OPDSBrowser:init()
|
||||
servers = {
|
||||
{
|
||||
title = "Project Gutenberg",
|
||||
url = "http://m.gutenberg.org/ebooks.opds/?format=opds",
|
||||
url = "https://m.gutenberg.org/ebooks.opds/?format=opds",
|
||||
},
|
||||
{
|
||||
title = "Project Gutenberg [Searchable]",
|
||||
@@ -87,11 +87,11 @@ function OPDSBrowser:init()
|
||||
},
|
||||
{
|
||||
title = "Feedbooks",
|
||||
url = "http://www.feedbooks.com/publicdomain/catalog.atom",
|
||||
url = "https://catalog.feedbooks.com/catalog/public_domain.atom",
|
||||
},
|
||||
{
|
||||
title = "ManyBooks",
|
||||
url = "http://manybooks.net/opds/index.php",
|
||||
url = "https://manybooks.net/opds/index.php",
|
||||
},
|
||||
{
|
||||
title = "Internet Archive",
|
||||
@@ -99,11 +99,11 @@ function OPDSBrowser:init()
|
||||
},
|
||||
{
|
||||
title = "Flibusta (Russian)",
|
||||
url = "http://www.flibusta.is/opds",
|
||||
url = "https://www.flibusta.is/opds",
|
||||
},
|
||||
{
|
||||
title = "Flibusta [Ru] [Searchable]",
|
||||
url = "http://www.flibusta.is/opds/search?searchTerm=%s",
|
||||
url = "https://www.flibusta.is/opds/search?searchTerm=%s",
|
||||
searchable = true,
|
||||
},
|
||||
{
|
||||
@@ -388,9 +388,9 @@ end
|
||||
function OPDSBrowser:getCatalog(item_url, username, password)
|
||||
local ok, catalog = pcall(self.parseFeed, self, item_url, username, password)
|
||||
if not ok and catalog then
|
||||
logger.info("cannot get catalog info from", item_url, catalog)
|
||||
logger.info("cannot get catalog info from", item_url or "nil", catalog)
|
||||
UIManager:show(InfoMessage:new{
|
||||
text = T(_("Cannot get catalog info from %1"), (BD.url(item_url) or "")),
|
||||
text = T(_("Cannot get catalog info from %1"), (item_url and BD.url(item_url) or "nil")),
|
||||
})
|
||||
return
|
||||
end
|
||||
@@ -498,7 +498,7 @@ function OPDSBrowser:genItemTableFromCatalog(catalog, item_url, username, passwo
|
||||
end
|
||||
end
|
||||
if author then
|
||||
item.text = title .. "\n" .. author
|
||||
item.text = title .. " - " .. author
|
||||
end
|
||||
end
|
||||
item.title = title
|
||||
@@ -676,7 +676,7 @@ function OPDSBrowser:showDownloads(item)
|
||||
end
|
||||
|
||||
function OPDSBrowser:browse(browse_url, username, password)
|
||||
logger.dbg("Browse opds url", browse_url)
|
||||
logger.dbg("Browse opds url", browse_url or "nil")
|
||||
table.insert(self.paths, {
|
||||
url = browse_url,
|
||||
username = username,
|
||||
|
||||
Reference in New Issue
Block a user