diff --git a/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift b/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift
index 78dbc540f..2c629565f 100644
--- a/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift
+++ b/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift
@@ -9,8 +9,6 @@ import Foundation
public final class HTMLEntityDecoder {
- static let ampersandCharacter = Character("&")
-
public static func decodedString(_ encodedString: String) -> String {
let scanner = EntityScanner(string: encodedString)
@@ -19,7 +17,7 @@ public final class HTMLEntityDecoder {
while true {
- let scannedString = scanner.scanUpTo(Self.ampersandCharacter)
+ let scannedString = scanner.scanUpToAmpersand()
if !scannedString.isEmpty {
result.append(scannedString)
}
@@ -73,12 +71,15 @@ final class EntityScanner {
self.count = string.count
}
+ static let ampersandCharacter = Character("&")
+
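-/// Scans up to `characterToFind` and returns the characters up to (and not including) `characterToFind`.
-/// - Returns: the scanned portion before `characterToFind`. May be empty string.
+/// Scans up to the next `&` and returns the characters before it (not including the `&`).
+/// - Returns: the scanned portion before the `&`. May be an empty string.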
- func scanUpTo(_ characterToFind: Character) -> String {
+ func scanUpToAmpersand() -> String {
+ let characterToFind = Self.ampersandCharacter
var scanned = ""
-
+
while true {
guard let ch = currentCharacter else {
@@ -166,8 +167,183 @@ extension String {
}
}
-/// rawEntity is assumed not to have opening `&` and closing `;`.
+/// Decodes a single HTML entity: named (`amp`), decimal (`#38`), or hexadecimal (`#x26`).
+/// - Parameter rawEntity: the entity text; may or may not have leading `&` and/or trailing `;` characters.
+/// - Returns: the decoded text, or `nil` if the name is unknown or the number cannot be used.
private func decodedEntity(_ rawEntity: String) -> String? {
+    var s = rawEntity
+    if s.hasPrefix("&") {
+        s.removeFirst()
+    }
+    if s.hasSuffix(";") {
+        s.removeLast()
+    }
+
+    if let named = entitiesDictionary[s] {
+        return named
+    }
+
+    if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex
+        let scanner = Scanner(string: s)
+        scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX")
+        var hexValue: UInt64 = 0
+        // Untrusted input: `UInt32(hexValue)` traps above `UInt32.max`, so convert failably; 0 is rejected as in the decimal branch.
+        guard scanner.scanHexInt64(&hexValue), let value = UInt32(exactly: hexValue), value >= 1 else {
+            return nil
+        }
+        return stringWithValue(value)
+    }
+
+    if s.hasPrefix("#") { // Decimal
+        guard let value = UInt32(s.dropFirst()), value >= 1 else {
+            return nil
+        }
+        return stringWithValue(value)
+    }
return nil
}
+
+/// Builds the string for a numeric character reference.
+/// Values 0x80...0x9F are first replaced using the table below (taken from WebCore's HTMLEntityParser).
+/// - Returns: the result of decoding the value's four little-endian bytes as UTF-32; `nil` if that decoding fails.
+private func stringWithValue(_ value: UInt32) -> String? {
+    // From WebCore's HTMLEntityParser
+    let windows1252Remap: [UInt32] = [
+        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
+        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
+        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
+        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
+    ]
+
+    // Same test as `(value & ~0x1F) == 0x80`: only the 32 values 0x80...0x9F have a table entry.
+    let codePoint = (0x80...0x9F).contains(value)
+        ? windows1252Remap[Int(value - 0x80)]
+        : value
+
+    // Copy the code point's bytes in little-endian order so they match the `.utf32LittleEndian` decoding.
+    let utf32Bytes = withUnsafeBytes(of: codePoint.littleEndian) { Data($0) }
+
+    return String(data: utf32Bytes, encoding: .utf32LittleEndian)
+}
+
+/// Maps entity names (without `&` and `;`) to replacement text; includes aliases such as `brkbar` (same as `brvbar`) and `die` (same as `uml`).
+private let entitiesDictionary: [String: String] = [
+    "AElig": "Æ",
+    "Aacute": "Á",
+    "Acirc": "Â",
+    "Agrave": "À",
+    "Aring": "Å",
+    "Atilde": "Ã",
+    "Auml": "Ä",
+    "Ccedil": "Ç",
+    "Dstrok": "Ð",
+    "ETH": "Ð",
+    "Eacute": "É",
+    "Ecirc": "Ê",
+    "Egrave": "È",
+    "Euml": "Ë",
+    "Iacute": "Í",
+    "Icirc": "Î",
+    "Igrave": "Ì",
+    "Iuml": "Ï",
+    "Ntilde": "Ñ",
+    "Oacute": "Ó",
+    "Ocirc": "Ô",
+    "Ograve": "Ò",
+    "Oslash": "Ø",
+    "Otilde": "Õ",
+    "Ouml": "Ö",
+    "Pi": "Π",
+    "THORN": "Þ",
+    "Uacute": "Ú",
+    "Ucirc": "Û",
+    "Ugrave": "Ù",
+    "Uuml": "Ü",
+    "Yacute": "Ý", // was plain "Y", which dropped the acute accent
+    "aacute": "á",
+    "acirc": "â",
+    "acute": "´",
+    "aelig": "æ",
+    "agrave": "à",
+    "amp": "&",
+    "apos": "'",
+    "aring": "å",
+    "atilde": "ã",
+    "auml": "ä",
+    "brkbar": "¦",
+    "brvbar": "¦",
+    "ccedil": "ç",
+    "cedil": "¸",
+    "cent": "¢",
+    "copy": "©",
+    "curren": "¤",
+    "deg": "°",
+    "die": "¨",
+    "divide": "÷",
+    "eacute": "é",
+    "ecirc": "ê",
+    "egrave": "è",
+    "eth": "ð",
+    "euml": "ë",
+    "euro": "€",
+    "frac12": "½",
+    "frac14": "¼",
+    "frac34": "¾",
+    "gt": ">",
+    "hearts": "♥",
+    "hellip": "…",
+    "iacute": "í",
+    "icirc": "î",
+    "iexcl": "¡",
+    "igrave": "ì",
+    "iquest": "¿",
+    "iuml": "ï",
+    "laquo": "«",
+    "ldquo": "“",
+    "lsquo": "‘",
+    "lt": "<",
+    "macr": "¯",
+    "mdash": "—",
+    "micro": "µ",
+    "middot": "·",
+    "ndash": "–",
+    "not": "¬",
+    "ntilde": "ñ",
+    "oacute": "ó",
+    "ocirc": "ô",
+    "ograve": "ò",
+    "ordf": "ª",
+    "ordm": "º",
+    "oslash": "ø",
+    "otilde": "õ",
+    "ouml": "ö",
+    "para": "¶",
+    "pi": "π",
+    "plusmn": "±",
+    "pound": "£",
+    "quot": "\"",
+    "raquo": "»",
+    "rdquo": "”",
+    "reg": "®",
+    "rsquo": "’",
+    "sect": "§",
+    "shy": "\u{00AD}", // soft hyphen (code point 173); written as an escape because it is invisible
+    "sup1": "¹",
+    "sup2": "²",
+    "sup3": "³",
+    "szlig": "ß",
+    "thorn": "þ",
+    "times": "×",
+    "trade": "™",
+    "uacute": "ú",
+    "ucirc": "û",
+    "ugrave": "ù",
+    "uml": "¨",
+    "uuml": "ü",
+    "yacute": "ý", // was plain "y", which dropped the acute accent
+    "yen": "¥",
+    "yuml": "ÿ",
+    "infin": "∞",
+    "nbsp": "\u{00A0}" // no-break space (code point 160); written as an escape so it is not mistaken for a plain space
+]