mirror of
https://github.com/koreader/koreader.git
synced 2025-08-10 00:52:38 +00:00
Fix handling of invalid UTF-8 sequences
External data (and, in bad cases, our own) can contain invalid byte sequences in UTF-8 strings; file names are a prominent example. There was an off-by-one bug in calculating the allowed length for multibyte chars, and the iterator was a bit too greedy when it stumbled upon an invalid sequence: it returned a single "invalid" char covering the whole sequence up to the byte where decoding found it to be invalid. Now we emit one invalid char for the first byte of that sequence and then check for a valid char starting at the next byte.
This commit is contained in:
@@ -44,7 +44,7 @@ local function utf8Chars(input)
|
||||
else
|
||||
return pos+1, 0xFFFD, "\xFF\xFD"
|
||||
end
|
||||
if string.len(input) < (pos + bytes_left - 1) then
|
||||
if string.len(input) < (pos + bytes_left) then
|
||||
return pos+1, 0xFFFD, "\xFF\xFD"
|
||||
end
|
||||
for i = pos+1, pos + bytes_left do
|
||||
@@ -52,7 +52,9 @@ local function utf8Chars(input)
|
||||
if bit.band(value, 0xC0) == 0x80 then
|
||||
glyph = bit.bor(bit.lshift(glyph, 6), bit.band(value, 0x3F))
|
||||
else
|
||||
return i+1, 0xFFFD, "\xFF\xFD"
|
||||
-- invalid UTF8 continuation - don't be greedy, just skip
|
||||
-- the initial char of the sequence.
|
||||
return pos+1, 0xFFFD, "\xFF\xFD"
|
||||
end
|
||||
end
|
||||
-- TODO: check for valid ranges here!
|
||||
|
||||
Reference in New Issue
Block a user