From a2c5d39d85680ed0ee2e9b17830f02567484c82d Mon Sep 17 00:00:00 2001 From: Benoit Pierre Date: Thu, 19 Jun 2025 23:37:59 +0200 Subject: [PATCH] gettext: switch to binary catalogs Binary catalogs are more space efficient (total translations disk usage: 14MB instead of 35MB), and easier and faster (roughly 10 times) to load: no parsing, and less processing (no need to unescape strings). Not much difference on the Android APK size (a ~600KB reduction), but other TAR.GZ / ZIP distributions see a reduction of ~3.5MB. --- Makefile | 28 +--- frontend/gettext.lua | 327 +++++++++++++++++++++++-------------- make/gettext.mk | 34 ++++ spec/unit/gettext_spec.lua | 71 +++----- 4 files changed, 264 insertions(+), 196 deletions(-) create mode 100644 make/gettext.mk diff --git a/Makefile b/Makefile index c9c252089..b6f908e2e 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -PHONY = all android-ndk android-sdk base clean distclean doc fetchthirdparty po pot re static-check +PHONY = all android-ndk android-sdk base clean distclean doc fetchthirdparty re static-check SOUND = $(INSTALL_DIR)/% # koreader-base directory @@ -81,6 +81,7 @@ ev_replay.py help history l10n/templates +l10n/*/*.po ota resources/fonts* resources/icons/src* @@ -134,7 +135,7 @@ $(foreach a,$1,'$(if $(filter --%,$a),$a,$(abspath $a))') $(or $2,koreader) $(call release_excludes,$(or $2,koreader)/) endef -all: base +all: base mo install -d $(INSTALL_DIR)/koreader rm -f $(INSTALL_DIR)/koreader/git-rev; echo "$(VERSION)" > $(INSTALL_DIR)/koreader/git-rev ifdef ANDROID @@ -190,7 +191,7 @@ else git submodule update --jobs 3 --init --recursive endif -clean: base-clean +clean: base-clean mo-clean rm -rf $(INSTALL_DIR) distclean: clean base-distclean @@ -209,31 +210,14 @@ ifneq (,$(wildcard make/$(TARGET).mk)) include make/$(TARGET).mk endif +include make/gettext.mk + android-ndk: $(MAKE) -C $(KOR_BASE)/toolchain $(ANDROID_NDK_HOME) android-sdk: $(MAKE) -C $(KOR_BASE)/toolchain $(ANDROID_HOME) -# for gettext -DOMAIN=koreader -TEMPLATE_DIR=l10n/templates -XGETTEXT_BIN=xgettext - -pot: po - mkdir -p $(TEMPLATE_DIR) - $(XGETTEXT_BIN) --from-code=utf-8 \ - --keyword=C_:1c,2 --keyword=N_:1,2 --keyword=NC_:1c,2,3 \ - --add-comments=@translators \ - reader.lua `find frontend -iname "*.lua" | sort` \ - `find plugins -iname "*.lua" | sort` \ - `find tools -iname "*.lua" | sort` \ - -o $(TEMPLATE_DIR)/$(DOMAIN).pot - -po: - git submodule update --remote l10n - - static-check: @if which luacheck > /dev/null; then \ luacheck -q {reader,setupkoenv,datastorage}.lua frontend plugins spec; \ diff --git a/frontend/gettext.lua b/frontend/gettext.lua index b74ea73b8..e4895a88f 100644 --- a/frontend/gettext.lua +++ b/frontend/gettext.lua @@ -21,6 +21,12 @@ See @{ffi.util.template}() for more information about the template function. local isAndroid, android = pcall(require, "android") local logger = require("logger") +local buffer = require("string.buffer") +local ffi = require("ffi") +local C = ffi.C + +require "table.new" +require "ffi/posix_h" local GetText = { context = {}, @@ -61,21 +67,6 @@ function GetText_mt.__call(gettext, msgid) return gettext.translation[msgid] and gettext.translation[msgid][0] or gettext.translation[msgid] or gettext.wrapUntranslated(msgid) end -local function c_escape(what_full, what) - if what == "\n" then return "" - elseif what == "a" then return "\a" - elseif what == "b" then return "\b" - elseif what == "f" then return "\f" - elseif what == "n" then return "\n" - elseif what == "r" then return "\r" - elseif what == "t" then return "\t" - elseif what == "v" then return "\v" - elseif what == "0" then return "\0" -- shouldn't happen, though - else - return what_full - end -end - --- Converts C logical operators to Lua. local function logicalCtoLua(logical_str) logical_str = logical_str:gsub("&&", "and") @@ -137,9 +128,13 @@ local function getPluralFunc(pl_tests, nplurals, plural_default) end local function addTranslation(msgctxt, msgid, msgstr, n) - -- translated string - local unescaped_string = string.gsub(msgstr, "(\\(.))", c_escape) - if msgctxt and msgctxt ~= "" then + assert(not msgctxt or msgctxt ~= "") + assert(msgid and msgid ~= "") + assert(msgstr) + if msgstr == "" then + return + end + if msgctxt then if not GetText.context[msgctxt] then GetText.context[msgctxt] = {} end @@ -147,26 +142,22 @@ local function addTranslation(msgctxt, msgid, msgstr, n) if not GetText.context[msgctxt][msgid] then GetText.context[msgctxt][msgid] = {} end - GetText.context[msgctxt][msgid][n] = unescaped_string ~= "" and unescaped_string or nil + GetText.context[msgctxt][msgid][n] = msgstr else - GetText.context[msgctxt][msgid] = unescaped_string ~= "" and unescaped_string or nil + GetText.context[msgctxt][msgid] = msgstr end else if n then if not GetText.translation[msgid] then GetText.translation[msgid] = {} end - GetText.translation[msgid][n] = unescaped_string ~= "" and unescaped_string or nil + GetText.translation[msgid][n] = msgstr else - GetText.translation[msgid] = unescaped_string ~= "" and unescaped_string or nil + GetText.translation[msgid] = msgstr end end end --- for PO file syntax, see --- https://www.gnu.org/software/gettext/manual/html_node/PO-Files.html --- we only implement a sane subset for now - function GetText_mt.__index.changeLang(new_lang) GetText.context = {} GetText.translation = {} @@ -180,119 +171,210 @@ function GetText_mt.__index.changeLang(new_lang) -- strip encoding suffix in locale like "zh_CN.utf8" new_lang = new_lang:sub(1, new_lang:find(".%.")) - local file = GetText.dirname .. "/" .. new_lang .. "/" .. GetText.textdomain .. ".po" - local po = io.open(file, "r") - - if not po then - logger.dbg("cannot open translation file:", file) + local mo = GetText.dirname .. "/" .. new_lang .. "/" .. GetText.textdomain .. ".mo" + if not GetText.loadMO(mo) then return false end - local data = {} - local in_comments = false - local fuzzy = false - local headers - local what = nil - while true do - local line = po:read("*l") - if line == nil or line == "" then - if data.msgid and data.msgid_plural and data["msgstr[0]"] then - for k, v in pairs(data) do - local n = tonumber(k:match("msgstr%[([0-9]+)%]")) - local msgstr = v + GetText.current_lang = new_lang + return true +end - if n and msgstr and msgstr ~= "" then - addTranslation(data.msgctxt, data.msgid, msgstr, n) - end - end - elseif data.msgid and data.msgstr and data.msgstr ~= "" then - -- header - if not headers and data.msgid == "" then - headers = data.msgstr - local plural_forms = data.msgstr:match("Plural%-Forms: (.*)") - local nplurals = plural_forms:match("nplurals=([0-9]+);") or 2 - local plurals = plural_forms:match("plural=%((.*)%);") +local function parse_headers(headers) + local plural_forms = headers:match("Plural%-Forms: (.*)") + local nplurals = plural_forms:match("nplurals=([0-9]+);") or 2 + local plurals = plural_forms:match("plural=%((.*)%);") - -- Hardcoded workaround for Hebrew which has 4 plural forms. - if plurals == "n == 1) ? 0 : ((n == 2) ? 1 : ((n > 10 && n % 10 == 0) ? 2 : 3)" then - plurals = "n == 1 ? 0 : (n == 2) ? 1 : (n > 10 && n % 10 == 0) ? 2 : 3" - end - -- Hardcoded workaround for Latvian. - if plurals == "n % 10 == 0 || n % 100 >= 11 && n % 100 <= 19) ? 0 : ((n % 10 == 1 && n % 100 != 11) ? 1 : 2" then - plurals = "n % 10 == 0 || n % 100 >= 11 && n % 100 <= 19 ? 0 : (n % 10 == 1 && n % 100 != 11) ? 1 : 2" - end - -- Hardcoded workaround for Romanian which has 3 plural forms. - if plurals == "n == 1) ? 0 : ((n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2" then - plurals = "n == 1 ? 0 : (n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2" - end + -- Hardcoded workaround for Hebrew which has 4 plural forms. + if plurals == "n == 1) ? 0 : ((n == 2) ? 1 : ((n > 10 && n % 10 == 0) ? 2 : 3)" then + plurals = "n == 1 ? 0 : (n == 2) ? 1 : (n > 10 && n % 10 == 0) ? 2 : 3" + end + -- Hardcoded workaround for Latvian. + if plurals == "n % 10 == 0 || n % 100 >= 11 && n % 100 <= 19) ? 0 : ((n % 10 == 1 && n % 100 != 11) ? 1 : 2" then + plurals = "n % 10 == 0 || n % 100 >= 11 && n % 100 <= 19 ? 0 : (n % 10 == 1 && n % 100 != 11) ? 1 : 2" + end + -- Hardcoded workaround for Romanian which has 3 plural forms. + if plurals == "n == 1) ? 0 : ((n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2" then + plurals = "n == 1 ? 0 : (n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2" + end - if not plurals then - -- Some languages (e.g., Arabic) may not use parentheses. - -- However, the following more inclusive match is more likely - -- to accidentally include junk and seldom relevant. - -- We might also be dealing with a language without plurals. - -- That would look like `plural=0`. - plurals = plural_forms:match("plural=(.*);") - end + if not plurals then + -- Some languages (e.g., Arabic) may not use parentheses. + -- However, the following more inclusive match is more likely + -- to accidentally include junk and seldom relevant. + -- We might also be dealing with a language without plurals. + -- That would look like `plural=0`. + plurals = plural_forms:match("plural=(.*);") + end - if plurals:find("[^n!=%%<>&:%(%)|?0-9 ]") then - -- we don't trust this input, go with default instead - plurals = GetText.plural_default - end + if plurals:find("[^n!=%%<>&:%(%)|?0-9 ]") then + -- we don't trust this input, go with default instead + plurals = GetText.plural_default + end - local pl_tests = {} - for pl_test in plurals:gmatch("[^:]+") do - table.insert(pl_tests, pl_test) - end + local pl_tests = {} + for pl_test in plurals:gmatch("[^:]+") do + table.insert(pl_tests, pl_test) + end - GetText.getPlural = getPluralFunc(pl_tests, nplurals, GetText.plural_default) - if not GetText.getPlural then - GetText.getPlural = getDefaultPlural - end - end + GetText.getPlural = getPluralFunc(pl_tests, nplurals, GetText.plural_default) + if not GetText.getPlural then + GetText.getPlural = getDefaultPlural + end +end - addTranslation(data.msgctxt, data.msgid, data.msgstr) +-- for MO file format, see +-- https://www.gnu.org/software/gettext/manual/html_node/MO-Files.html + +ffi.cdef[[ +struct __attribute__((packed)) mo_header { + uint32_t magic; + uint16_t revision_major; + uint16_t revision_minor; + uint32_t nb_strings; + uint32_t original_strings_table_offset; + uint32_t translated_strings_table_offset; + uint32_t hash_table_size; + uint32_t hash_table_offset; +}; + +struct __attribute__((packed)) mo_string_table { + uint32_t length; + uint32_t offset; +}; +]] +local MO_MAGIC = 0x950412de + +function GetText_mt.__index.loadMO(file) + local fd = C.open(file, C.O_RDONLY) + if fd < 0 then + logger.dbg(string.format("cannot open translation file: %s", file)) + return false + end + local strerror = function() + return ffi.string(C.strerror(ffi.errno())) + end + local seek_and_read = function(off, ptr, len) + local ret + ret = C.lseek(fd, off, C.SEEK_SET) + if ret ~= off then + logger.err(string.format("loading translation file failed: %s [%s]", file, ret < 0 and strerror() or "lseek")) + return false + end + ret = C.read(fd, ptr, len) + if ret ~= len then + logger.err(string.format("loading translation file failed: %s [%s]", file), ret < 0 and strerror() or "short read") + return false + end + return true + end + local mo_hdr = ffi.new("struct mo_header") + if not seek_and_read(0, mo_hdr, ffi.sizeof(mo_hdr)) then + C.close(fd) + return false + end + if mo_hdr.magic ~= MO_MAGIC then + logger.err(string.format("bad translation file: %s [magic]", file)) + C.close(fd) + return false + end + if mo_hdr.revision_major ~= 0 then + logger.err(string.format("bad translation file: %s [revision]", file)) + C.close(fd) + return false + end + local table_buf = buffer:new() + local table_size = mo_hdr.nb_strings * ffi.sizeof("struct mo_string_table") + local table_ptr = table_buf:reserve(table_size) + local read_strings_count + local read_strings = function(check_for_context) + local m_str_tbl = ffi.cast("struct mo_string_table *", table_ptr) + local str_buf = buffer:new() + read_strings_count = -1 + return function() + read_strings_count = read_strings_count + 1 + if read_strings_count >= mo_hdr.nb_strings then + return end - -- stop at EOF: - if line == nil then break end - data = {} - what = nil - else - -- comment - if line:match("^#") then - if not in_comments then - in_comments = true - fuzzy = false + local str_len = m_str_tbl[read_strings_count].length + local str_off = m_str_tbl[read_strings_count].offset + local str_ptr = str_buf:reserve(str_len) + if not seek_and_read(str_off, str_ptr, str_len) then + return + end + local ctx + local pos = 0 + if check_for_context then + -- 4: ␄ (End of Transmission). + local p = C.memchr(str_ptr, 4, str_len) + if p ~= nil then + local l = ffi.cast("ssize_t", p) - ffi.cast("ssize_t", str_ptr) + ctx = ffi.string(str_ptr, l) + pos = l + 1 end - if line:match(", fuzzy") then - fuzzy = true + end + local l = C.strnlen(str_ptr + pos, str_len - pos) + if l + pos < str_len then + -- Plurals! + local strings = {ffi.string(str_ptr + pos, l)} + pos = pos + l + 1 + while pos < str_len do + l = C.strnlen(str_ptr + pos, str_len - pos) + table.insert(strings, ffi.string(str_ptr + pos, l)) + pos = pos + l + 1 end - elseif fuzzy then - in_comments = false + return read_strings_count + 1, strings, ctx else - in_comments = false - -- new data item (msgid, msgstr, ... - local w, s = line:match("^%s*([%a_%[%]0-9]+)%s+\"(.*)\"%s*$") - if w then - what = w - else - -- string continuation - s = line:match("^%s*\"(.*)\"%s*$") - end - if what and s then - -- unescape \n or msgid won't match - s = s:gsub("\\n", "\n") - -- unescape " or msgid won't match - s = s:gsub('\\"', '"') - -- unescape \\ or msgid won't match - s = s:gsub("\\\\", "\\") - data[what] = (data[what] or "") .. s - end + return read_strings_count + 1, ffi.string(str_ptr + pos, str_len - pos), ctx end end end - po:close() - GetText.current_lang = new_lang + -- Read original strings. + if not seek_and_read(mo_hdr.original_strings_table_offset, table_ptr, table_size) then + C.close(fd) + return false + end + local original_context = {} + local original_strings = table.new(mo_hdr.nb_strings, 0) + for n, s, ctx in read_strings(true) do + if ctx then + original_context[n] = ctx + end + original_strings[n] = s + end + if read_strings_count ~= mo_hdr.nb_strings then + C.close(fd) + return false + end + -- Read translated strings. + if not seek_and_read(mo_hdr.translated_strings_table_offset, table_ptr, table_size) then + C.close(fd) + return false + end + for n, ts in read_strings() do + local ctx = original_context[n] + local os = original_strings[n] + if type(os) == "table" then + if type(ts) == "table" then + for pn, pts in ipairs(ts) do + addTranslation(ctx, os[1], pts, pn - 1) + end + else + addTranslation(ctx, os[1], ts, 0) + end + elseif type(ts) == "table" then + logger.warn(string.format("bad translation file: %s [singular / plurals mismatch]", file)) + else + if n == 1 and #os == 0 then + parse_headers(ts) + else + addTranslation(ctx, os, ts) + end + end + end + local ok = read_strings_count == mo_hdr.nb_strings + C.close(fd) + return ok end GetText_mt.__index.getPlural = getDefaultPlural @@ -412,7 +494,6 @@ elseif os.getenv("LANG") then end if isAndroid then - local ffi = require("ffi") local buf = ffi.new("char[?]", 16) android.lib.AConfiguration_getLanguage(android.app.config, buf) local lang = ffi.string(buf) diff --git a/make/gettext.mk b/make/gettext.mk new file mode 100644 index 000000000..2980c6900 --- /dev/null +++ b/make/gettext.mk @@ -0,0 +1,34 @@ +PHONY += mo mo-clean po pot + +SELF := $(lastword $(MAKEFILE_LIST)) + +DOMAIN = koreader +TEMPLATE_DIR = l10n/templates +MSGFMT_BIN = msgfmt +XGETTEXT_BIN = xgettext + +PO_FILES = $(wildcard l10n/*/*.po) +MO_FILES = $(PO_FILES:%.po=%.mo) + +%.mo: %.po + # FIXME: use `--check`? + @$(MSGFMT_BIN) --no-hash -o $@ $< + +mo: + $(MAKE) $(if $(PARALLEL_JOBS),--jobs=$(PARALLEL_JOBS)) $(if $(PARALLEL_LOAD),--load-average=$(PARALLEL_LOAD)) --silent --file=$(SELF) $(MO_FILES) + +mo-clean: + rm -f $(MO_FILES) + +pot: po + mkdir -p $(TEMPLATE_DIR) + $(XGETTEXT_BIN) --from-code=utf-8 \ + --keyword=C_:1c,2 --keyword=N_:1,2 --keyword=NC_:1c,2,3 \ + --add-comments=@translators \ + reader.lua `find frontend -iname "*.lua" | sort` \ + `find plugins -iname "*.lua" | sort` \ + `find tools -iname "*.lua" | sort` \ + -o $(TEMPLATE_DIR)/$(DOMAIN).pot + +po: + git submodule update --remote l10n diff --git a/spec/unit/gettext_spec.lua b/spec/unit/gettext_spec.lua index eb3b0107d..29e802b41 100644 --- a/spec/unit/gettext_spec.lua +++ b/spec/unit/gettext_spec.lua @@ -106,75 +106,44 @@ msgstr "Fuzzy translated" describe("GetText module", function() local GetText - local test_po_ar - local test_po_nl, test_po_ru - local test_po_none, test_po_simple - local test_po_many setup(function() require("commonrequire") GetText = require("gettext") - GetText.dirname = "i18n-test" + GetText.dirname = (os.getenv("KO_HOME") or ".").."/i18n-test" local lfs = require("libs/libkoreader-lfs") + lfs.mkdir(GetText.dirname) - lfs.mkdir(GetText.dirname.."/nl_NL") - lfs.mkdir(GetText.dirname.."/none") - lfs.mkdir(GetText.dirname.."/ar") - lfs.mkdir(GetText.dirname.."/ru") - lfs.mkdir(GetText.dirname.."/simple") - lfs.mkdir(GetText.dirname.."/many") - test_po_nl = GetText.dirname.."/nl_NL/koreader.po" - local f = io.open(test_po_nl, "w") - f:write(test_po_part1, test_plurals_nl, test_po_part2) - f:close() + local pocreate = function(lang, ...) + local dir = GetText.dirname.."/"..lang + local po = dir.."/koreader.po" + local mo = dir.."/koreader.mo" + lfs.mkdir(dir) + local f = io.open(po, "w") + f:write(...) + f:close() + local ok = os.execute(string.format("msgfmt --no-hash -o %s %s", mo, po)) + assert(ok == 0) + end + + pocreate("nl_NL", test_po_part1, test_plurals_nl, test_po_part2) -- same file, just different plural for testing - test_po_none = GetText.dirname.."/none/koreader.po" - f = io.open(test_po_none, "w") - f:write(test_po_part1, test_plurals_none, test_po_part2) - f:close() + pocreate("none", test_po_part1, test_plurals_none, test_po_part2) -- same file, just different plural for testing - test_po_ar = GetText.dirname.."/ar/koreader.po" - f = io.open(test_po_ar, "w") - f:write(test_po_part1, test_plurals_ar, test_po_part2) - f:close() + pocreate("ar", test_po_part1, test_plurals_ar, test_po_part2) -- same file, just different plural for testing - test_po_ru = GetText.dirname.."/ru/koreader.po" - f = io.open(test_po_ru, "w") - f:write(test_po_part1, test_plurals_ru, test_po_part2) - f:close() + pocreate("ru", test_po_part1, test_plurals_ru, test_po_part2) -- same file, just different plural for testing - test_po_simple = GetText.dirname.."/simple/koreader.po" - f = io.open(test_po_simple, "w") - f:write(test_po_part1, test_plurals_simple, test_po_part2) - f:close() + pocreate("simple", test_po_part1, test_plurals_simple, test_po_part2) -- same file, just different plural for testing - test_po_many = GetText.dirname.."/many/koreader.po" - f = io.open(test_po_many, "w") - f:write(test_po_part1, test_plurals_many, test_po_part2) - f:close() - end) - - teardown(function() - os.remove(test_po_nl) - os.remove(test_po_none) - os.remove(test_po_ar) - os.remove(test_po_ru) - os.remove(test_po_simple) - os.remove(test_po_many) - os.remove(GetText.dirname.."/nl_NL") - os.remove(GetText.dirname.."/none") - os.remove(GetText.dirname.."/ar") - os.remove(GetText.dirname.."/ru") - os.remove(GetText.dirname.."/simple") - os.remove(GetText.dirname.."/many") - os.remove(GetText.dirname) + pocreate("many", test_po_part1, test_plurals_many, test_po_part2) end) describe("changeLang", function()