From e315820b477c00913801435efece3e2d14f3690c Mon Sep 17 00:00:00 2001 From: Brent Simmons Date: Mon, 16 Sep 2024 21:56:55 -0700 Subject: [PATCH] Continue progress on HTMLEntityDecoder. --- .../Sources/SAX/HTMLEntityDecoder.swift | 188 +++++++++++++++++- 1 file changed, 182 insertions(+), 6 deletions(-) diff --git a/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift b/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift index 78dbc540f..2c629565f 100644 --- a/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift +++ b/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift @@ -9,8 +9,6 @@ import Foundation public final class HTMLEntityDecoder { - static let ampersandCharacter = Character("&") - public static func decodedString(_ encodedString: String) -> String { let scanner = EntityScanner(string: encodedString) @@ -19,7 +17,7 @@ public final class HTMLEntityDecoder { while true { - let scannedString = scanner.scanUpTo(Self.ampersandCharacter) + let scannedString = scanner.scanUpToAmpersand() if !scannedString.isEmpty { result.append(scannedString) } @@ -73,12 +71,15 @@ final class EntityScanner { self.count = string.count } + static let ampersandCharacter = Character("&") + /// Scans up to `characterToFind` and returns the characters up to (and not including) `characterToFind`. /// - Returns: the scanned portion before `characterToFind`. May be empty string. - func scanUpTo(_ characterToFind: Character) -> String { + func scanUpToAmpersand() -> String { + let characterToFind = Self.ampersandCharacter var scanned = "" - + while true { guard let ch = currentCharacter else { @@ -166,8 +167,183 @@ extension String { } } -/// rawEntity is assumed not to have opening `&` and closing `;`. +/// rawEntity may or may not have leading `&` and/or trailing `;` characters. private func decodedEntity(_ rawEntity: String) -> String? { + var s = rawEntity + + if s.hasPrefix("&") { + s.removeFirst() + } + if s.hasSuffix(";") { + s.removeLast() + } + + if let decodedEntity = entitiesDictionary[s] { + return decodedEntity + } + + if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex + let scanner = Scanner(string: s) + scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX") + var hexValue: UInt64 = 0 + if scanner.scanHexInt64(&hexValue) { + return stringWithValue(UInt32(hexValue)) + } + return nil + } + + else if s.hasPrefix("#") { + s.removeFirst() + guard let value = UInt32(s), value >= 1 else { + return nil + } + return stringWithValue(value) + } + return nil } + +private func stringWithValue(_ value: UInt32) -> String? { + + // From WebCore's HTMLEntityParser + let windowsLatin1ExtensionArray: [UInt32] = [ + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F + ] + + var modifiedValue = value + + if (modifiedValue & ~0x1F) == 0x80 { // value >= 128 && value < 160 + modifiedValue = windowsLatin1ExtensionArray[Int(modifiedValue - 0x80)] + } + + modifiedValue = CFSwapInt32HostToLittle(modifiedValue) + + let data = Data(bytes: &modifiedValue, count: MemoryLayout.size(ofValue: modifiedValue)) + + return String(data: data, encoding: .utf32LittleEndian) +} + +private let entitiesDictionary = + [ + "AElig": "Æ", + "Aacute": "Á", + "Acirc": "Â", + "Agrave": "À", + "Aring": "Å", + "Atilde": "Ã", + "Auml": "Ä", + "Ccedil": "Ç", + "Dstrok": "Ð", + "ETH": "Ð", + "Eacute": "É", + "Ecirc": "Ê", + "Egrave": "È", + "Euml": "Ë", + "Iacute": "Í", + "Icirc": "Î", + "Igrave": "Ì", + "Iuml": "Ï", + "Ntilde": "Ñ", + "Oacute": "Ó", + "Ocirc": "Ô", + "Ograve": "Ò", + "Oslash": "Ø", + "Otilde": "Õ", + "Ouml": "Ö", + "Pi": "Π", + "THORN": "Þ", + "Uacute": "Ú", + "Ucirc": "Û", + "Ugrave": "Ù", + "Uuml": "Ü", + "Yacute": "Y", + "aacute": "á", + "acirc": "â", + "acute": "´", + "aelig": "æ", + "agrave": "à", + "amp": "&", + "apos": "'", + "aring": "å", + "atilde": "ã", + "auml": "ä", + "brkbar": "¦", + "brvbar": "¦", + "ccedil": "ç", + "cedil": "¸", + "cent": "¢", + "copy": "©", + "curren": "¤", + "deg": "°", + "die": "¨", + "divide": "÷", + "eacute": "é", + "ecirc": "ê", + "egrave": "è", + "eth": "ð", + "euml": "ë", + "euro": "€", + "frac12": "½", + "frac14": "¼", + "frac34": "¾", + "gt": ">", + "hearts": "♥", + "hellip": "…", + "iacute": "í", + "icirc": "î", + "iexcl": "¡", + "igrave": "ì", + "iquest": "¿", + "iuml": "ï", + "laquo": "«", + "ldquo": "“", + "lsquo": "‘", + "lt": "<", + "macr": "¯", + "mdash": "—", + "micro": "µ", + "middot": "·", + "ndash": "–", + "not": "¬", + "ntilde": "ñ", + "oacute": "ó", + "ocirc": "ô", + "ograve": "ò", + "ordf": "ª", + "ordm": "º", + "oslash": "ø", + "otilde": "õ", + "ouml": "ö", + "para": "¶", + "pi": "π", + "plusmn": "±", + "pound": "£", + "quot": "\"", + "raquo": "»", + "rdquo": "”", + "reg": "®", + "rsquo": "’", + "sect": "§", + "shy": stringWithValue(173), + "sup1": "¹", + "sup2": "²", + "sup3": "³", + "szlig": "ß", + "thorn": "þ", + "times": "×", + "trade": "™", + "uacute": "ú", + "ucirc": "û", + "ugrave": "ù", + "uml": "¨", + "uuml": "ü", + "yacute": "y", + "yen": "¥", + "yuml": "ÿ", + "infin": "∞", + "nbsp": stringWithValue(160) + ]