From e315820b477c00913801435efece3e2d14f3690c Mon Sep 17 00:00:00 2001
From: Brent Simmons <brent@ranchero.com>
Date: Mon, 16 Sep 2024 21:56:55 -0700
Subject: [PATCH] Continue progress on HTMLEntityDecoder.

---
 .../Sources/SAX/HTMLEntityDecoder.swift       | 188 +++++++++++++++++-
 1 file changed, 182 insertions(+), 6 deletions(-)

diff --git a/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift b/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift
index 78dbc540f..2c629565f 100644
--- a/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift
+++ b/Modules/Parser/Sources/SAX/HTMLEntityDecoder.swift
@@ -9,8 +9,6 @@ import Foundation
 
 public final class HTMLEntityDecoder {
 
-	static let ampersandCharacter = Character("&")
-
 	public static func decodedString(_ encodedString: String) -> String {
 
 		let scanner = EntityScanner(string: encodedString)
@@ -19,7 +17,7 @@ public final class HTMLEntityDecoder {
 
 		while true {
 
-			let scannedString = scanner.scanUpTo(Self.ampersandCharacter)
+			let scannedString = scanner.scanUpToAmpersand()
 			if !scannedString.isEmpty {
 				result.append(scannedString)
 			}
@@ -73,12 +71,15 @@ final class EntityScanner {
 		self.count = string.count
 	}
 
+	static let ampersandCharacter = Character("&")
+
 	/// Scans up to `characterToFind` and returns the characters up to (and not including) `characterToFind`.
 	/// - Returns: the scanned portion before `characterToFind`. May be empty string.
-	func scanUpTo(_ characterToFind: Character) -> String {
+	func scanUpToAmpersand() -> String {
 
+		let characterToFind = Self.ampersandCharacter
 		var scanned = ""
-
+		
 		while true {
 
 			guard let ch = currentCharacter else {
@@ -166,8 +167,183 @@ extension String {
 	}
 }
 
-/// rawEntity is assumed not to have opening `&` and closing `;`.
+/// Decodes one HTML entity (named, decimal `#N`, or hex `#xN`); a leading `&` and/or trailing `;` on `rawEntity` is optional. Returns nil if it can't be decoded.
 private func decodedEntity(_ rawEntity: String) -> String? {
 
+	var s = rawEntity
+	// Normalize to the bare entity body, e.g. "&amp;" -> "amp".
+	if s.hasPrefix("&") {
+		s.removeFirst()
+	}
+	if s.hasSuffix(";") {
+		s.removeLast()
+	}
+	// Named entities are tried first; numeric references are handled below.
+	if let decodedEntity = entitiesDictionary[s] {
+		return decodedEntity
+	}
+	// Numeric references: reject 0 and any value that does not fit in UInt32 (a plain UInt32(hexValue) conversion would trap).
+	if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex
+		let scanner = Scanner(string: s)
+		scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX")
+		var hexValue: UInt64 = 0
+		guard scanner.scanHexInt64(&hexValue), let value = UInt32(exactly: hexValue), value >= 1 else {
+			return nil
+		}
+		return stringWithValue(value)
+	}
+
+	if s.hasPrefix("#") { // Decimal
+		s.removeFirst()
+		guard let value = UInt32(s), value >= 1 else {
+			return nil
+		}
+		return stringWithValue(value)
+	}
+
 	return nil
 }
+
+/// Returns the string for the Unicode code point `value`, or nil if it is not a valid Unicode scalar.
+/// Code points 0x80...0x9F are first remapped through the Windows Latin-1 extension table below.
+private func stringWithValue(_ value: UInt32) -> String? {
+
+	// From WebCore's HTMLEntityParser
+	let windowsLatin1ExtensionArray: [UInt32] = [
+		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
+		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
+		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
+		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
+	]
+
+	var modifiedValue = value
+	if (modifiedValue & ~0x1F) == 0x80 { // value >= 128 && value < 160
+		modifiedValue = windowsLatin1ExtensionArray[Int(modifiedValue - 0x80)]
+	}
+
+	// The failable initializer rejects surrogates and values above 0x10FFFF,
+	// so no byte-order handling or Data round-trip is needed.
+	guard let scalar = Unicode.Scalar(modifiedValue) else { return nil }
+	return String(scalar)
+}
+
+/// Named HTML entities (without `&` and `;`) mapped to their replacement text. Some names are aliases for the same character (e.g. `brkbar`/`brvbar`, `die`/`uml`).
+private let entitiesDictionary: [String: String] = [
+	"AElig": "Æ",
+	"Aacute": "Á",
+	"Acirc": "Â",
+	"Agrave": "À",
+	"Aring": "Å",
+	"Atilde": "Ã",
+	"Auml": "Ä",
+	"Ccedil": "Ç",
+	"Dstrok": "Ð",
+	"ETH": "Ð",
+	"Eacute": "É",
+	"Ecirc": "Ê",
+	"Egrave": "È",
+	"Euml": "Ë",
+	"Iacute": "Í",
+	"Icirc": "Î",
+	"Igrave": "Ì",
+	"Iuml": "Ï",
+	"Ntilde": "Ñ",
+	"Oacute": "Ó",
+	"Ocirc": "Ô",
+	"Ograve": "Ò",
+	"Oslash": "Ø",
+	"Otilde": "Õ",
+	"Ouml": "Ö",
+	"Pi": "Π",
+	"THORN": "Þ",
+	"Uacute": "Ú",
+	"Ucirc": "Û",
+	"Ugrave": "Ù",
+	"Uuml": "Ü",
+	"Yacute": "Ý",
+	"aacute": "á",
+	"acirc": "â",
+	"acute": "´",
+	"aelig": "æ",
+	"agrave": "à",
+	"amp": "&",
+	"apos": "'",
+	"aring": "å",
+	"atilde": "ã",
+	"auml": "ä",
+	"brkbar": "¦",
+	"brvbar": "¦",
+	"ccedil": "ç",
+	"cedil": "¸",
+	"cent": "¢",
+	"copy": "©",
+	"curren": "¤",
+	"deg": "°",
+	"die": "¨",
+	"divide": "÷",
+	"eacute": "é",
+	"ecirc": "ê",
+	"egrave": "è",
+	"eth": "ð",
+	"euml": "ë",
+	"euro": "€",
+	"frac12": "½",
+	"frac14": "¼",
+	"frac34": "¾",
+	"gt": ">",
+	"hearts": "♥",
+	"hellip": "…",
+	"iacute": "í",
+	"icirc": "î",
+	"iexcl": "¡",
+	"igrave": "ì",
+	"iquest": "¿",
+	"iuml": "ï",
+	"laquo": "«",
+	"ldquo": "“",
+	"lsquo": "‘",
+	"lt": "<",
+	"macr": "¯",
+	"mdash": "—",
+	"micro": "µ",
+	"middot": "·",
+	"ndash": "–",
+	"not": "¬",
+	"ntilde": "ñ",
+	"oacute": "ó",
+	"ocirc": "ô",
+	"ograve": "ò",
+	"ordf": "ª",
+	"ordm": "º",
+	"oslash": "ø",
+	"otilde": "õ",
+	"ouml": "ö",
+	"para": "¶",
+	"pi": "π",
+	"plusmn": "±",
+	"pound": "£",
+	"quot": "\"",
+	"raquo": "»",
+	"rdquo": "”",
+	"reg": "®",
+	"rsquo": "’",
+	"sect": "§",
+	"shy": "\u{00AD}", // soft hyphen (decimal 173)
+	"sup1": "¹",
+	"sup2": "²",
+	"sup3": "³",
+	"szlig": "ß",
+	"thorn": "þ",
+	"times": "×",
+	"trade": "™",
+	"uacute": "ú",
+	"ucirc": "û",
+	"ugrave": "ù",
+	"uml": "¨",
+	"uuml": "ü",
+	"yacute": "ý",
+	"yen": "¥",
+	"yuml": "ÿ",
+	"infin": "∞",
+	"nbsp": "\u{00A0}" // no-break space (decimal 160)
+]