Calibre: Metadata parser improvements (#11922)

* Added a safe pure-Lua SAX JSON parser (via LunaJSON).
* Updated RapidJSON.
* Also implemented a calibre-specific SAX parser in Lua-RapidJSON, and use it by default instead of the full RapidJSON one.
* Raised the file-size threshold to switch between the fast & safe parsers to 50MB.
* Added a UI option to switch between the three parsers.
This commit is contained in:
Martín Fernández
2024-06-06 01:06:46 +02:00
committed by GitHub
parent 04eec52eee
commit 79c13bee0c
4 changed files with 142 additions and 82 deletions

2
base

Submodule base updated: 22a6e4f616...4518f06447

View File

@@ -118,6 +118,11 @@ function Calibre:addToMainMenu(menu_items)
keep_menu_open = true,
sub_item_table = self:getWirelessMenuTable(),
},
{
text = _("JSON parser"),
keep_menu_open = true,
sub_item_table = self:getParserMenuTable(),
},
}
}
-- insert the metadata search
@@ -414,4 +419,50 @@ function Calibre:getWirelessMenuTable()
return t
end
--- Build the sub-menu used to pick the JSON parser for calibre metadata.
-- The choice is stored in G_reader_settings under "calibre_json_parser":
-- the "Automatic" entry deletes the setting, every other entry saves a
-- fixed parser name ("fast", "safe" or "legacy").
-- @treturn table list of menu items (text, help_text, checked_func, callback)
function Calibre:getParserMenuTable()
    local setting = "calibre_json_parser"

    -- Menu item that pins the parser to one named implementation.
    local function pinned(text, help_text, impl)
        return {
            text = text,
            help_text = help_text,
            checked_func = function()
                return G_reader_settings:readSetting(setting) == impl
            end,
            callback = function()
                G_reader_settings:saveSetting(setting, impl)
            end,
        }
    end

    local items = {}

    -- "Automatic" is the only entry that is not backed by a stored value:
    -- it is checked through hasNot() and selected by deleting the setting.
    items[1] = {
        text = _("Automatic"),
        help_text = _("The program will decide based on the size of the JSON file. Recommended"),
        checked_func = function()
            return G_reader_settings:hasNot(setting)
        end,
        callback = function()
            G_reader_settings:delSetting(setting)
        end,
    }
    items[2] = pinned(
        _("Fast"),
        _("Faster parsing, but may not take too kindly to malformed input files"),
        "fast"
    )
    items[3] = pinned(
        _("Safe"),
        _("Slower, but safer. Useful if you're experiencing problems with the other modes"),
        "safe"
    )
    items[4] = pinned(
        _("Legacy"),
        _("Fast, but requires more RAM, only recommended on modest library sizes (or beefier devices)"),
        "legacy"
    )

    return items
end
return Calibre

View File

@@ -52,9 +52,9 @@ local function slim(book, is_search)
return slim_book
end
-- this is the max file size we attempt to decode using json. For larger
-- files we want to attempt to manually parse the file to avoid OOM errors
local MAX_JSON_FILESIZE = 30 * 1000 * 1000
-- This is the max file size we attempt to decode using rapidjson.
-- For larger files we use a sax parser to avoid OOM errors
local MAX_JSON_FILESIZE = 50 * 1024 * 1024
--- find calibre files for a given dir
local function findCalibreFiles(dir)
@@ -121,8 +121,11 @@ function CalibreMetadata:loadBookList()
return {}
end
local books, err
if attr.size > MAX_JSON_FILESIZE then
books, err = parser.parseFile(self.metadata)
local impl = G_reader_settings:readSetting("calibre_json_parser") or attr.size > MAX_JSON_FILESIZE and "safe" or "fast"
if impl == "fast" then
books, err = rapidjson.load_calibre(self.metadata)
elseif impl == "safe" then
books, err = parser.parseFile(self.metadata)
else
books, err = rapidjson.load(self.metadata)
end

View File

@@ -1,90 +1,96 @@
-- A parser for metadata.calibre
local util = require("util")
-- parse "metadata.calibre" files
local lj = require("lunajson")
-- removes leading and closing characters and converts hex-unicodes
local function replaceHexChars(s, n, j)
local l = string.len(s)
if string.sub(s, l, l) == "\"" then
s = string.sub(s, n, string.len(s)-1)
-- Turn a list of names into a lookup set (name -> true).
local function set_of(names)
    local set = {}
    for i = 1, #names do
        set[names[i]] = true
    end
    return set
end

-- Fields for which the startarray handler creates a table to collect values.
local array_fields = set_of({ "authors", "tags", "series" })

-- Keys that the key handler keeps; every other key is not selected.
local required_fields = set_of({
    "authors",
    "last_modified",
    "lpath",
    "series",
    "series_index",
    "size",
    "tags",
    "title",
    "uuid",
})
local field
local t = {}
local function append(v)
-- Some fields *may* be arrays, so check whether we ran through startarray first or not
if t[field] then
table.insert(t[field], v)
else
s = string.sub(s, n, string.len(s)-j)
end
s = string.gsub(s, "\\u([a-f0-9][a-f0-9][a-f0-9][a-f0-9])", function(w)
return util.unicodeCodepointToUtf8(tonumber(w, 16))
end)
return s
end
-- a couple of string helper functions for dealing with raw json strings
local function isEqual(str, key)
if str:sub(1, key:len() + 6) == string.format(" \"%s\"", key) then
return true
end
return false
end
local function getValue(str, key)
if str == string.format(" \"%s\": null, ", key) then
return nil
else
return replaceHexChars(str, key:len() + 10, key == "series_index" and 2 or 3)
t[field] = v
field = nil
end
end
local jsonStr = getmetatable("")
jsonStr.__index["equals"] = isEqual
jsonStr.__index["value"] = getValue
-- Number of JSON objects currently open. endobject treats an object that
-- closes at depth 1 as one finished record.
local depth = 0
-- Records collected by the current parse.
local result = {}
-- SAX callbacks handed to lunajson. They fill the record under construction
-- (`t`) with the values of the keys listed in `required_fields`, using
-- `field` as "the key whose value we are currently waiting for".
local sax = {
    startobject = function()
        depth = depth + 1
    end,
    endobject = function()
        if depth == 1 then
            -- a record is complete: store it and start a fresh one
            table.insert(result, t)
            t = {}
        end
        depth = depth - 1
    end,
    startarray = function()
        -- only array-capable fields get a table; append() then inserts into it
        if array_fields[field] then
            t[field] = {}
        end
    end,
    endarray = function()
        field = nil
    end,
    key = function(s)
        -- Only keys of the record itself (depth 1) are of interest. Always
        -- overwrite `field`, so that a previous key whose value produced no
        -- callback we act on cannot capture the value of this key.
        if depth == 1 and required_fields[s] then
            field = s
        else
            field = nil
        end
    end,
    string = function(s)
        if field then
            append(s)
        end
    end,
    number = function(n)
        if field then
            append(n)
        end
    end,
    boolean = function(b)
        if field then
            append(b)
        end
    end,
    null = function()
        -- A null value ends a scalar field without storing anything.
        -- Inside an array that is being collected, keep collecting.
        if field and type(t[field]) ~= "table" then
            field = nil
        end
    end,
}
--- Run the lunajson SAX file parser over `path`, feeding the `sax` callbacks.
-- Errors are not caught here; parser.parseFile calls this through pcall.
-- @tparam string path file to parse
local function parse_unsafe(path)
    -- Start from a clean state: a previous run that raised in the middle of
    -- a record would otherwise leave a non-zero depth (so no record of this
    -- run would ever be stored) and a half-filled record behind.
    field = nil
    t = {}
    depth = 0
    local p = lj.newfileparser(path, sax)
    p.run()
end
local parser = {}
-- read metadata from file, line by line, and keep just the data we need
function parser.parseFile(file)
assert(type(file) == "string", "wrong type (expected a string")
local f, err = io.open(file, "rb")
if not f then
return nil, string.format("error parsing %s: %s", file, err)
result = {}
local ok, err = pcall(parse_unsafe, file)
field = nil
if not ok then
return nil, err
end
f:close()
local add = function(t, line)
if type(t) ~= "table" or type(line) ~= "string" then
return {}
end
line = replaceHexChars(line, 8, 3)
table.insert(t, #t + 1, line)
return t
end
local books, book = {}, {}
local is_author, is_tag = false, false
for line in io.lines(file) do
if line == " }, " or line == " }" then
if type(book) == "table" then
table.insert(books, #books + 1, book)
end
book = {}
elseif line == " \"authors\": [" then
is_author = true
elseif line == " \"tags\": [" then
is_tag = true
elseif line == " ], " or line == " ]" then
is_author, is_tag = false, false
else
for _, key in ipairs({"title", "uuid", "lpath", "size",
"last_modified", "series", "series_index"})
do
if line:equals(key) then
book[key] = line:value(key)
break
end
end
end
if is_author then
book.authors = add(book.authors, line)
elseif is_tag then
book.tags = add(book.tags, line)
end
end
return books
return result
end
return parser