Simplify Parser — use one target instead of multiple.

This commit is contained in:
Brent Simmons
2024-09-24 14:45:53 -07:00
parent c48e72c3b2
commit 591e451b69
51 changed files with 14 additions and 218 deletions

View File

@@ -0,0 +1,68 @@
//
// Data+Parser.swift
//
//
// Created by Brent Simmons on 8/24/24.
//
import Foundation
public extension Data {

	/// Return true if the data contains a given String.
	///
	/// Assumes that the data is UTF-8 or a similar encoding.
	/// If it's UTF-16 or UTF-32, for instance, this will always return false.
	/// Luckily these are rare.
	///
	/// The String to search for should be something that could be encoded
	/// in ASCII, like "<opml" or "<rss". (In other words,
	/// the sequence of characters would always be the same in
	/// commonly-used encodings.)
	///
	/// - Parameter searchFor: The text whose UTF-8 bytes are searched for.
	/// - Returns: `true` if the UTF-8 bytes of `searchFor` appear in the data.
	func containsASCIIString(_ searchFor: String) -> Bool {
		contains(searchFor.utf8)
	}

	/// Return true if searchFor appears in self.
	///
	/// - Parameter searchFor: The byte sequence to look for. It may be a slice of
	///   another `Data` (its indices do not have to start at 0).
	/// - Returns: `false` when `searchFor` is empty or longer than `self`;
	///   otherwise whether the bytes of `searchFor` occur contiguously in `self`.
	func contains(_ searchFor: Data) -> Bool {
		let needleCount = searchFor.count
		let haystackCount = count

		guard needleCount > 0, needleCount <= haystackCount else {
			return false
		}

		// Both buffers are read through `withUnsafeBytes`, whose buffer is always
		// zero-based. Subscripting `searchFor` directly with 0...n would trap when
		// `searchFor` is a slice whose `startIndex` is not 0.
		return withUnsafeBytes { (haystack: UnsafeRawBufferPointer) -> Bool in
			searchFor.withUnsafeBytes { (needle: UnsafeRawBufferPointer) -> Bool in
				let firstByte = needle[0]

				// Only offsets where the first byte matches are worth comparing in full.
				for start in 0...(haystackCount - needleCount) where haystack[start] == firstByte {
					if (1..<needleCount).allSatisfy({ haystack[start + $0] == needle[$0] }) {
						return true
					}
				}
				return false
			}
		}
	}
}

View File

@@ -0,0 +1,28 @@
//
// Dictionary+Parser.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
public extension Dictionary where Key == String, Value == String {

	/// Looks up a value by key, falling back to a case-insensitive comparison.
	///
	/// An exact match on `key` is tried first. If there is none, every entry is
	/// compared against `key` case-insensitively and the first match encountered
	/// while iterating the dictionary is returned.
	///
	/// - Parameter key: The key to look up.
	/// - Returns: The matching value, or `nil` if no key matches.
	func object(forCaseInsensitiveKey key: String) -> String? {
		// Fast path: the key is present exactly as given.
		if let exactMatch = self[key] {
			return exactMatch
		}

		let foldedKey = key.lowercased()
		let match = first(where: { entry in
			foldedKey.caseInsensitiveCompare(entry.key) == .orderedSame
		})
		return match?.value
	}
}

View File

@@ -0,0 +1,23 @@
//
// String+RSParser.swift
// RSParser
//
// Created by Nate Weaver on 2020-01-19.
// Copyright © 2020 Ranchero Software, LLC. All rights reserved.
//
import Foundation
public extension String {

	/// The string itself, or `nil` when it is empty or consists only of
	/// whitespace and newline characters. The returned string is not trimmed.
	var nilIfEmptyOrWhitespace: String? {
		let trimmed = trimmingCharacters(in: .whitespacesAndNewlines)
		guard !trimmed.isEmpty else {
			return nil
		}
		return self
	}

	/// Returns `true` when `s` is `nil` or the empty string.
	///
	/// Whitespace-only strings are not considered empty.
	static func isEmptyOrNil(_ s: String?) -> Bool {
		s?.isEmpty ?? true
	}
}

View File

@@ -0,0 +1,349 @@
//
// HTMLEntityDecoder.swift
//
//
// Created by Brent Simmons on 9/14/24.
//
import Foundation
public final class HTMLEntityDecoder {

	/// Returns `encodedString` with HTML entities (such as `&amp;` or `&#8212;`)
	/// replaced by the characters they stand for.
	///
	/// An "&" that does not begin a decodable entity is copied through unchanged,
	/// together with the text that follows it.
	///
	/// - Parameter encodedString: Text that may contain HTML entities.
	/// - Returns: The decoded text. When nothing was decoded, `encodedString`
	///   itself is returned.
	public static func decodedString(_ encodedString: String) -> String {

		let scanner = EntityScanner(string: encodedString)
		var result = ""
		var didDecodeAtLeastOneEntity = false

		while true {

			// Copies the text before the next "&". The scanner also consumes
			// the "&" itself, so afterwards it sits on the first character of
			// a possible entity name.
			let scannedString = scanner.scanUpToAmpersand()
			if !scannedString.isEmpty {
				result.append(scannedString)
			}
			if scanner.isAtEnd {
				// The scan reached the end of the input. If the input ends with
				// "&", that ampersand was consumed but not returned; restore it
				// so it is not lost from the output.
				if encodedString.last == "&" {
					result.append("&")
				}
				break
			}

			let savedScanLocation = scanner.scanLocation

			if let decodedEntity = scanner.scanEntityValue() {
				result.append(decodedEntity)
				didDecodeAtLeastOneEntity = true
			}
			else {
				// Not an entity: keep the "&" and resume with the character
				// right after it. (`savedScanLocation` is already past the
				// ampersand, so adding 1 here would drop a character.)
				result.append("&")
				scanner.scanLocation = savedScanLocation
			}

			if scanner.isAtEnd {
				break
			}
		}

		if !didDecodeAtLeastOneEntity { // No entities decoded?
			return encodedString
		}
		return result
	}
}
/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
///
/// Positions are counted in `Character`s from the start of `string`.
final class EntityScanner {

	let string: String
	let count: Int
	var scanLocation = 0

	/// The characters of `string`, cached so that looking up the character at
	/// `scanLocation` does not have to walk the string from its start each time.
	private let characters: [Character]

	var isAtEnd: Bool {
		scanLocation >= count
	}

	/// The character at `scanLocation`, or `nil` when the location is outside the string.
	var currentCharacter: Character? {
		guard scanLocation >= 0, !isAtEnd else {
			return nil
		}
		return characters[scanLocation]
	}

	init(string: String) {
		let characters = Array(string)
		self.string = string
		self.characters = characters
		self.count = characters.count
	}

	static let ampersandCharacter = Character("&")

	/// Scans up to the next `&` and returns the characters before it.
	///
	/// The ampersand itself is consumed: afterwards `scanLocation` is just past
	/// it (or at the end of the string if there was no ampersand).
	/// - Returns: the scanned portion before the `&`. May be empty string.
	func scanUpToAmpersand() -> String {

		var scanned = ""

		while let ch = currentCharacter {
			scanLocation += 1
			if ch == Self.ampersandCharacter {
				break
			}
			scanned.append(ch)
		}

		return scanned
	}

	static let semicolonCharacter = Character(";")

	/// Scans an entity body (the part between `&` and `;`) starting at `scanLocation`.
	///
	/// - Returns: The decoded entity, with `scanLocation` moved past the `;`.
	///   Returns `nil`, with `scanLocation` restored to where the scan started,
	///   when whitespace or the end of the string is reached before a `;`, when
	///   the candidate is too long, or when the entity is not recognized.
	func scanEntityValue() -> String? {

		let initialScanLocation = scanLocation
		let maxEntityLength = 20 // It's probably smaller, but this is just for sanity.

		while let ch = currentCharacter {

			// Whitespace cannot appear inside an entity.
			if let scalar = ch.unicodeScalars.first, CharacterSet.whitespacesAndNewlines.contains(scalar) {
				break
			}

			if ch == Self.semicolonCharacter {
				let entity = String(characters[initialScanLocation..<scanLocation])
				// Unknown or malformed entities are ordinary input rather than a
				// programmer error, so this fails quietly instead of asserting.
				guard let decoded = decodedEntity(entity) else {
					break
				}
				scanLocation += 1
				return decoded
			}

			scanLocation += 1
			if scanLocation - initialScanLocation > maxEntityLength {
				break
			}
		}

		scanLocation = initialScanLocation
		return nil
	}
}
extension String {

	/// Converts a character offset into a `String.Index`.
	///
	/// - Parameter i: Offset in characters from the start of the string.
	/// - Returns: The index at that offset, or `nil` when `i` is negative or
	///   greater than the character count. An offset equal to the character
	///   count yields `endIndex`, which is valid as a range bound but must not
	///   be subscripted.
	func indexForInt(_ i: Int) -> Index? {
		// `limitedBy: endIndex` only bounds forward movement, so negative
		// offsets are rejected explicitly.
		guard i >= 0 else {
			return nil
		}
		return index(startIndex, offsetBy: i, limitedBy: endIndex)
	}

	/// Returns the character at character offset `i`, or `nil` when `i` is out of bounds.
	func characterAtIntIndex(_ i: Int) -> Character? {
		// `indexForInt` may return `endIndex`, which cannot be subscripted.
		guard let index = indexForInt(i), index < endIndex else {
			return nil
		}
		return self[index]
	}

	/// Returns the substring covered by a range of character offsets,
	/// or `nil` when either bound lies outside the string.
	func substring(intRange: Range<Int>) -> String? {
		guard let rangeLower = indexForInt(intRange.lowerBound),
			  let rangeUpper = indexForInt(intRange.upperBound) else {
			return nil
		}
		return String(self[rangeLower..<rangeUpper])
	}
}
/// Decodes a single HTML entity.
///
/// rawEntity may or may not have leading `&` and/or trailing `;` characters.
///
/// Named entities are looked up in `entitiesDictionary`. Numeric entities are
/// accepted in hexadecimal (`#x41`, `#X41`) and decimal (`#65`) form.
///
/// - Returns: The decoded text, or `nil` when the entity is not recognized,
///   or is a numeric reference that is zero or does not fit in 32 bits.
private func decodedEntity(_ rawEntity: String) -> String? {

	var s = rawEntity

	if s.hasPrefix("&") {
		s.removeFirst()
	}
	if s.hasSuffix(";") {
		s.removeLast()
	}

	if let namedEntity = entitiesDictionary[s] {
		return namedEntity
	}

	if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex
		let scanner = Scanner(string: s)
		scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX")
		var hexValue: UInt64 = 0
		guard scanner.scanHexInt64(&hexValue) else {
			return nil
		}
		// `UInt32(hexValue)` would trap on input such as "#xFFFFFFFFF".
		// Zero is rejected too, matching the decimal branch below.
		guard let value = UInt32(exactly: hexValue), value >= 1 else {
			return nil
		}
		return stringWithValue(value)
	}

	if s.hasPrefix("#") { // Decimal
		s.removeFirst()
		guard let value = UInt32(s), value >= 1 else {
			return nil
		}
		return stringWithValue(value)
	}

	return nil
}
/// Returns a one-character string for a numeric character reference.
///
/// Values 0x80 through 0x9F are remapped through the table below before
/// conversion (for example 0x80 becomes U+20AC).
///
/// - Parameter value: The numeric value of the reference.
/// - Returns: The character as a string, or `nil` when the (remapped) value is
///   not a valid Unicode scalar, such as a surrogate or a value above U+10FFFF.
private func stringWithValue(_ value: UInt32) -> String? {

	// From WebCore's HTMLEntityParser
	let windowsLatin1ExtensionArray: [UInt32] = [
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
	]

	var codePoint = value
	if (codePoint & ~0x1F) == 0x80 { // value >= 128 && value < 160
		codePoint = windowsLatin1ExtensionArray[Int(codePoint - 0x80)]
	}

	// `Unicode.Scalar` validates the code point directly, so no byte swapping
	// or UTF-32 decoding round trip is needed.
	guard let scalar = Unicode.Scalar(codePoint) else {
		return nil
	}
	return String(Character(scalar))
}
/// Named HTML entities (without the leading `&` and trailing `;`) mapped to
/// their replacement text.
///
/// Values are optional because two entries are produced by `stringWithValue`,
/// which returns an optional. Entries whose characters are easily lost in
/// transcoding are written as `\u{...}` escapes.
private let entitiesDictionary: [String: String?] =
	[
		"AElig": "Æ",
		"Aacute": "Á",
		"Acirc": "Â",
		"Agrave": "À",
		"Aring": "Å",
		"Atilde": "Ã",
		"Auml": "Ä",
		"Ccedil": "Ç",
		"Dstrok": "Ð",
		"ETH": "Ð",
		"Eacute": "É",
		"Ecirc": "Ê",
		"Egrave": "È",
		"Euml": "Ë",
		"Iacute": "Í",
		"Icirc": "Î",
		"Igrave": "Ì",
		"Iuml": "Ï",
		"Ntilde": "Ñ",
		"Oacute": "Ó",
		"Ocirc": "Ô",
		"Ograve": "Ò",
		"Oslash": "Ø",
		"Otilde": "Õ",
		"Ouml": "Ö",
		"Pi": "Π",
		"THORN": "Þ",
		"Uacute": "Ú",
		"Ucirc": "Û",
		"Ugrave": "Ù",
		"Uuml": "Ü",
		"Yacute": "\u{00DD}", // Y with acute
		"aacute": "á",
		"acirc": "â",
		"acute": "´",
		"aelig": "æ",
		"agrave": "à",
		"amp": "&",
		"apos": "'",
		"aring": "å",
		"atilde": "ã",
		"auml": "ä",
		"brkbar": "¦",
		"brvbar": "¦",
		"ccedil": "ç",
		"cedil": "¸",
		"cent": "¢",
		"copy": "©",
		"curren": "¤",
		"deg": "°",
		"die": "¨",
		"divide": "÷",
		"eacute": "é",
		"ecirc": "ê",
		"egrave": "è",
		"eth": "ð",
		"euml": "ë",
		"euro": "\u{20AC}", // euro sign
		"frac12": "½",
		"frac14": "¼",
		"frac34": "¾",
		"gt": ">",
		"hearts": "\u{2665}", // black heart suit
		"hellip": "\u{2026}", // horizontal ellipsis
		"iacute": "í",
		"icirc": "î",
		"iexcl": "¡",
		"igrave": "ì",
		"iquest": "¿",
		"iuml": "ï",
		"laquo": "«",
		"ldquo": "\u{201C}", // left double quotation mark
		"lsquo": "\u{2018}", // left single quotation mark
		"lt": "<",
		"macr": "¯",
		"mdash": "\u{2014}", // em dash
		"micro": "µ",
		"middot": "·",
		"ndash": "\u{2013}", // en dash
		"not": "¬",
		"ntilde": "ñ",
		"oacute": "ó",
		"ocirc": "ô",
		"ograve": "ò",
		"ordf": "ª",
		"ordm": "º",
		"oslash": "ø",
		"otilde": "õ",
		"ouml": "ö",
		"para": "\u{00B6}", // pilcrow sign
		"pi": "π",
		"plusmn": "±",
		"pound": "£",
		"quot": "\"",
		"raquo": "»",
		"rdquo": "\u{201D}", // right double quotation mark
		"reg": "®",
		"rsquo": "\u{2019}", // right single quotation mark
		"sect": "§",
		"shy": stringWithValue(173),
		"sup1": "¹",
		"sup2": "²",
		"sup3": "³",
		"szlig": "ß",
		"thorn": "þ",
		"times": "×",
		"trade": "\u{2122}", // trade mark sign
		"uacute": "ú",
		"ucirc": "û",
		"ugrave": "ù",
		"uml": "¨",
		"uuml": "ü",
		"yacute": "\u{00FD}", // y with acute
		"yen": "¥",
		"yuml": "ÿ",
		"infin": "\u{221E}", // infinity
		"nbsp": stringWithValue(160)
	]

View File

@@ -0,0 +1,19 @@
//
// ParserData.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
/// A chunk of data to be parsed, paired with a URL string.
public struct ParserData: Sendable {

	/// URL string associated with `data` (presumably where the data came from; confirm against callers).
	public let url: String

	/// The raw bytes to parse.
	public let data: Data

	/// Creates a value holding the given URL string and data unchanged.
	public init(url: String, data: Data) {
		self.data = data
		self.url = url
	}
}

View File

@@ -0,0 +1,200 @@
//
// SAXHTMLParser.swift
//
//
// Created by Brent Simmons on 8/26/24.
//
import Foundation
import FoundationExtras
import libxml2
/// Receives element and character events from a `SAXHTMLParser` while it parses.
public protocol SAXHTMLParserDelegate: AnyObject {
/// Called when an element start tag is reported. `attributes` may be nil;
/// `SAXHTMLParser.attributesDictionary(_:)` converts it to a dictionary.
func saxHTMLParser(_: SAXHTMLParser, startElement: XMLPointer, attributes: UnsafePointer<XMLPointer?>?)
/// Called when an element end tag is reported. The parser stops storing
/// characters right after this call returns.
func saxHTMLParser(_: SAXHTMLParser, endElement: XMLPointer)
/// Called when character data is reported; `count` is the number of bytes at `charactersFound`.
// Length is guaranteed to be greater than 0.
func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int)
}
/// Push-parses HTML data with libxml2 and forwards SAX events to a delegate.
public final class SAXHTMLParser {

	fileprivate let delegate: SAXHTMLParserDelegate

	/// Bytes collected since `beginStoringCharacters()` was called,
	/// or `nil` when characters are not currently being stored.
	public var currentCharacters: Data? { // UTF-8 encoded
		storingCharacters ? characters : nil
	}

	// Conveniences to get string version of currentCharacters

	/// `currentCharacters` decoded as UTF-8; `nil` when nothing is stored,
	/// the stored data is empty, or it is not valid UTF-8.
	public var currentString: String? {
		guard let storedBytes = currentCharacters, !storedBytes.isEmpty else {
			return nil
		}
		return String(data: storedBytes, encoding: .utf8)
	}

	/// `currentString` with leading and trailing whitespace and newlines removed.
	public var currentStringWithTrimmedWhitespace: String? {
		currentString?.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
	}

	private var data: Data
	private var storingCharacters = false
	private var characters = Data()

	/// - Parameters:
	///   - delegate: Receives the SAX events; held strongly by the parser.
	///   - data: The HTML bytes to parse.
	public init(delegate: SAXHTMLParserDelegate, data: Data) {
		self.data = data
		self.delegate = delegate
	}

	/// Parses the data in one pass, calling the delegate as events occur.
	/// Does nothing when the data is empty.
	public func parse() {

		if data.isEmpty {
			return
		}

		data.withUnsafeBytes { rawBuffer in

			guard let bytes = rawBuffer.bindMemory(to: CChar.self).baseAddress else {
				return
			}

			let byteCount = Int32(data.count)
			let characterEncoding = xmlDetectCharEncoding(bytes, byteCount)

			// `self` is passed unretained as the user-data pointer; the C
			// callbacks turn it back into the parser.
			let userData = Unmanaged.passUnretained(self).toOpaque()
			let context = htmlCreatePushParserCtxt(&saxHandlerStruct, userData, nil, 0, nil, characterEncoding)

			let options = HTML_PARSE_RECOVER.rawValue | HTML_PARSE_NONET.rawValue | HTML_PARSE_COMPACT.rawValue | HTML_PARSE_NOERROR.rawValue | HTML_PARSE_NOWARNING.rawValue
			htmlCtxtUseOptions(context, Int32(options))

			// Feed everything in one chunk, then an empty terminating chunk.
			htmlParseChunk(context, bytes, byteCount, 0)
			htmlParseChunk(context, nil, 0, 1)

			htmlFreeParserCtxt(context)
		}
	}

	/// Delegate can call from xmlStartElement. Characters will be available in xmlEndElement as currentCharacters property. Storing characters is stopped after each xmlEndElement.
	public func beginStoringCharacters() {
		characters.count = 0
		storingCharacters = true
	}

	/// Stops storing characters and discards anything stored so far.
	public func endStoringCharacters() {
		characters.count = 0
		storingCharacters = false
	}

	/// Converts an attributes array into a dictionary.
	///
	/// The array is read as alternating name and value pointers and ends at the
	/// first nil name. A nil value pointer yields an empty string.
	///
	/// - Returns: `nil` when `attributes` is nil; otherwise the dictionary,
	///   which may be empty.
	public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?) -> StringDictionary? {

		guard let attributes else {
			return nil
		}

		var dictionary = [String: String]()
		var offset = 0

		while let namePointer = attributes[offset] {
			let value: String
			if let valuePointer = attributes[offset + 1] {
				value = String(cString: valuePointer)
			} else {
				value = ""
			}
			dictionary[String(cString: namePointer)] = value
			offset += 2
		}

		return dictionary
	}
}
// Entry points used by the C callbacks in `saxHandlerStruct`; each forwards to the delegate.
private extension SAXHTMLParser {
// Appends the bytes to the stored characters (only while storing is on), then notifies the delegate.
func charactersFound(_ htmlCharacters: XMLPointer, count: Int) {
if storingCharacters {
characters.append(htmlCharacters, count: count)
}
delegate.saxHTMLParser(self, charactersFound: htmlCharacters, count: count)
}
// Forwards an element start to the delegate.
func startElement(_ name: XMLPointer, attributes: UnsafePointer<XMLPointer?>?) {
delegate.saxHTMLParser(self, startElement: name, attributes: attributes)
}
// Forwards an element end to the delegate.
func endElement(_ name: XMLPointer) {
delegate.saxHTMLParser(self, endElement: name)
// Runs after the delegate call so the delegate can still read currentCharacters.
endStoringCharacters()
}
}
/// Turns the opaque user-data pointer handed to the SAX callbacks back into
/// the `SAXHTMLParser` that passed it in `parse()`, without taking ownership of it.
private func parser(from context: UnsafeMutableRawPointer) -> SAXHTMLParser {
	let unmanagedParser = Unmanaged<SAXHTMLParser>.fromOpaque(context)
	return unmanagedParser.takeUnretainedValue()
}
/// The libxml2 handler table used by `SAXHTMLParser.parse()`.
///
/// Each callback ignores calls with a nil context or nil payload, recovers the
/// parser from the context pointer and forwards the event to it.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {

	var handler = htmlSAXHandler()

	handler.startElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?, attributes: UnsafeMutablePointer<XMLPointer?>?) in
		if let context, let name {
			parser(from: context).startElement(name, attributes: attributes)
		}
	}

	handler.endElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?) in
		if let context, let name {
			parser(from: context).endElement(name)
		}
	}

	// Character runs of zero (or negative) length are dropped here, so the
	// delegate only ever sees a positive count.
	handler.characters = { (context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) in
		if let context, let ch, len > 0 {
			parser(from: context).charactersFound(ch, count: Int(len))
		}
	}

	return handler
}()

View File

@@ -0,0 +1,204 @@
//
// SAXParser.swift
//
//
// Created by Brent Simmons on 8/12/24.
//
import Foundation
import FoundationExtras
import libxml2
/// Pointer to libxml2 `xmlChar` values; the type used for names and character data in the SAX callbacks.
public typealias XMLPointer = UnsafePointer<xmlChar>
/// Receives element and character events from a `SAXParser` while it parses.
public protocol SAXParserDelegate {
/// Called when an element start is reported. `attributes` together with `attributeCount`
/// can be converted with `SAXParser.attributesDictionary(_:attributeCount:)`.
func saxParser(_: SAXParser, xmlStartElement: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?)
/// Called when an element end is reported. The parser stops storing characters right after this call returns.
func saxParser(_: SAXParser, xmlEndElement: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?)
/// Called when character data is reported; `count` is the number of bytes and is always greater than 0.
func saxParser(_: SAXParser, xmlCharactersFound: XMLPointer, count: Int)
}
/// Push-parses XML data with libxml2 and forwards SAX events to a delegate.
public final class SAXParser {

	fileprivate let delegate: SAXParserDelegate

	/// Bytes collected since `beginStoringCharacters()` was called,
	/// or `nil` when characters are not currently being stored.
	public var currentCharacters: Data? { // UTF-8 encoded
		guard storingCharacters else {
			return nil
		}
		return characters
	}

	// Conveniences to get string version of currentCharacters

	/// `currentCharacters` decoded as UTF-8; `nil` when nothing is stored,
	/// the stored data is empty, or it is not valid UTF-8.
	public var currentString: String? {
		guard let d = currentCharacters, !d.isEmpty else {
			return nil
		}
		return String(data: d, encoding: .utf8)
	}

	/// `currentString` with leading and trailing whitespace and newlines removed.
	public var currentStringWithTrimmedWhitespace: String? {
		guard let s = currentString else {
			return nil
		}
		return s.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
	}

	private var data: Data
	private var storingCharacters = false
	private var characters = Data()

	/// - Parameters:
	///   - delegate: Receives the SAX events; held strongly by the parser.
	///   - data: The XML bytes to parse.
	public init(delegate: SAXParserDelegate, data: Data) {
		self.delegate = delegate
		self.data = data
	}

	/// Parses the data in one pass, calling the delegate as events occur.
	/// Does nothing when the data is empty.
	public func parse() {

		guard !data.isEmpty else {
			return
		}

		// `self` is passed unretained as the user-data pointer; the C callbacks
		// turn it back into the parser.
		let context = xmlCreatePushParserCtxt(&saxHandlerStruct, Unmanaged.passUnretained(self).toOpaque(), nil, 0, nil)
		xmlCtxtUseOptions(context, Int32(XML_PARSE_RECOVER.rawValue | XML_PARSE_NOENT.rawValue))

		data.withUnsafeBytes { bufferPointer in
			if let bytes = bufferPointer.bindMemory(to: CChar.self).baseAddress {
				xmlParseChunk(context, bytes, Int32(data.count), 0)
			}
		}

		// An empty terminating chunk finishes the parse.
		xmlParseChunk(context, nil, 0, 1)
		xmlFreeParserCtxt(context)
	}

	/// Delegate can call from xmlStartElement. Characters will be available in xmlEndElement as currentCharacters property. Storing characters is stopped after each xmlEndElement.
	public func beginStoringCharacters() {
		storingCharacters = true
		characters.count = 0
	}

	/// Stops storing characters and discards anything stored so far.
	public func endStoringCharacters() {
		storingCharacters = false
		characters.count = 0
	}

	/// Converts the attributes array of a start-element event into a dictionary.
	///
	/// Each attribute occupies five consecutive pointers. This method reads the
	/// name (offset 0), the optional prefix (offset 1) and the value's start and
	/// end pointers (offsets 3 and 4). Per libxml2's SAX2 start-element callback
	/// documentation, offset 2 is the namespace URI; it is not used here. A
	/// prefixed attribute is stored under the key "prefix:name".
	///
	/// Attributes with a missing name or value pointer, or with a value that is
	/// not valid UTF-8, are skipped.
	///
	/// - Parameters:
	///   - attributes: The attributes array from the start-element event.
	///   - attributeCount: The number of attributes in the array.
	/// - Returns: `nil` when there are no attributes; otherwise the dictionary.
	public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?, attributeCount: Int) -> StringDictionary? {

		guard attributeCount > 0, let attributes else {
			return nil
		}

		let fieldCount = 5
		var dictionary = [String: String]()

		// A counted loop guarantees progress. (A `continue` inside a manually
		// advanced `while` loop would skip the increments and never terminate.)
		for attributeIndex in 0..<attributeCount {

			let base = attributeIndex * fieldCount

			guard let attribute = attributes[base],
				  let valueStart = attributes[base + 3],
				  let valueEnd = attributes[base + 4] else {
				continue
			}

			var attributeName = String(cString: attribute)
			if let prefix = attributes[base + 1] {
				let attributePrefix = String(cString: prefix)
				attributeName = "\(attributePrefix):\(attributeName)"
			}

			// The value is not NUL-terminated; its length is the pointer distance.
			let valueCount = valueEnd - valueStart
			guard valueCount >= 0 else {
				continue
			}

			let valueBytes = UnsafeRawBufferPointer(start: valueStart, count: valueCount)
			if let value = String(bytes: valueBytes, encoding: .utf8) {
				dictionary[attributeName] = value
			}
		}

		return dictionary
	}
}
// Entry points used by the C callback functions below; each forwards to the delegate.
private extension SAXParser {
// Appends the bytes to the stored characters (only while storing is on), then notifies the delegate.
func charactersFound(_ xmlCharacters: XMLPointer, count: Int) {
if storingCharacters {
characters.append(xmlCharacters, count: count)
}
delegate.saxParser(self, xmlCharactersFound: xmlCharacters, count: count)
}
// Forwards an element start to the delegate with all arguments unchanged.
func startElement(_ name: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {
delegate.saxParser(self, xmlStartElement: name, prefix: prefix, uri: uri, namespaceCount: namespaceCount, namespaces: namespaces, attributeCount: attributeCount, attributesDefaultedCount: attributesDefaultedCount, attributes: attributes)
}
// Forwards an element end to the delegate.
func endElement(_ name: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {
delegate.saxParser(self, xmlEndElement: name, prefix: prefix, uri: uri)
// Runs after the delegate call so the delegate can still read currentCharacters.
endStoringCharacters()
}
}
/// C callback for element starts: recovers the parser from `context` and
/// forwards the event. Calls with a nil context or nil name are ignored.
private func startElement(_ context: UnsafeMutableRawPointer?, name: XMLPointer?, prefix: XMLPointer?, URI: XMLPointer?, nb_namespaces: CInt, namespaces: UnsafeMutablePointer<XMLPointer?>?, nb_attributes: CInt, nb_defaulted: CInt, attributes: UnsafeMutablePointer<XMLPointer?>?) {

	if let context, let name {
		parser(from: context).startElement(
			name,
			prefix: prefix,
			uri: URI,
			namespaceCount: Int(nb_namespaces),
			namespaces: namespaces,
			attributeCount: Int(nb_attributes),
			attributesDefaultedCount: Int(nb_defaulted),
			attributes: attributes
		)
	}
}
/// C callback for element ends: recovers the parser from `context` and
/// forwards the event. Calls with a nil context or nil name are ignored.
private func endElement(_ context: UnsafeMutableRawPointer?, name: XMLPointer?, prefix: XMLPointer?, URI: XMLPointer?) {

	if let context, let name {
		parser(from: context).endElement(name, prefix: prefix, uri: URI)
	}
}
/// C callback for character data: recovers the parser from `context` and
/// forwards the bytes. Calls with a nil context, nil pointer or a length that
/// is not positive are ignored.
private func charactersFound(_ context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) {

	if let context, let ch, len > 0 {
		parser(from: context).charactersFound(ch, count: Int(len))
	}
}
/// Turns the opaque user-data pointer handed to the SAX callbacks back into
/// the `SAXParser` that passed it in `parse()`, without taking ownership of it.
private func parser(from context: UnsafeMutableRawPointer) -> SAXParser {
	let unmanagedParser = Unmanaged<SAXParser>.fromOpaque(context)
	return unmanagedParser.takeUnretainedValue()
}
/// The libxml2 handler table used by `SAXParser.parse()`.
///
/// Only the namespace-aware element callbacks and the characters callback are
/// set; every other field keeps the value from `xmlSAXHandler()`.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {

	var handler = xmlSAXHandler()

	// NOTE(review): XML_SAX2_MAGIC presumably marks the table as a SAX2 handler
	// so that the *Ns callbacks are used; confirm against the libxml2 docs.
	handler.initialized = XML_SAX2_MAGIC

	handler.startElementNs = startElement
	handler.endElementNs = endElement
	handler.characters = charactersFound

	return handler
}()

View File

@@ -0,0 +1,41 @@
//
// SAXUtilities.swift
//
//
// Created by Brent Simmons on 8/26/24.
//
import Foundation
import libxml2
/// Returns `true` when the NUL-terminated name at `localName` equals `tag`.
///
/// - Parameters:
///   - localName: A NUL-terminated element name.
///   - tag: The bytes to compare against, including a trailing 0 terminator.
/// - Returns: `true` only when both have the same length and the same bytes.
///   An empty `tag` (no terminator at all) never matches.
public func SAXEqualTags(_ localName: XMLPointer, _ tag: ContiguousArray<Int8>) -> Bool {

	// `tag.count` includes the 0 terminator. An empty array would otherwise
	// form the invalid range 0..<(-1).
	let tagLength = tag.count - 1
	guard tagLength >= 0 else {
		return false
	}

	for i in 0..<tagLength {

		let localNameCharacter = localName[i]
		if localNameCharacter == 0 {
			// localName ended before tag did.
			return false
		}

		// Reinterpret the signed byte rather than converting its value:
		// `UInt8(_:)` traps on negative (non-ASCII) bytes.
		let tagCharacter = UInt8(bitPattern: tag[i])
		if localNameCharacter != tagCharacter {
			return false
		}
	}

	// localName might actually be longer; make sure it's the same length as tag.
	return localName[tagLength] == 0
}
public extension String {

	/// Creates a string from the UTF-8 bytes at `xmlPointer`.
	///
	/// - Parameters:
	///   - xmlPointer: Start of the bytes.
	///   - count: Number of bytes to read. When `nil`, the bytes are read up to
	///     the first NUL, as measured by `strlen`.
	/// - Returns: `nil` when the bytes are not valid UTF-8.
	init?(xmlPointer: XMLPointer, count: Int? = nil) {
		let byteCount: Int
		if let count {
			byteCount = count
		} else {
			byteCount = strlen(xmlPointer)
		}
		let bytes = Data(bytes: xmlPointer, count: byteCount)
		self.init(data: bytes, encoding: .utf8)
	}
}