mirror of
https://github.com/Ranchero-Software/NetNewsWire
synced 2025-08-12 06:26:36 +00:00
Simplify Parser — use one target instead of multiple.
This commit is contained in:
68
Modules/Parser/Sources/Parser/SAX/Extensions/Data+SAX.swift
Normal file
68
Modules/Parser/Sources/Parser/SAX/Extensions/Data+SAX.swift
Normal file
@@ -0,0 +1,68 @@
|
||||
//
|
||||
//  Data+SAX.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 8/24/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
public extension Data {

	/// Return true if the data contains a given String.
	///
	/// Assumes that the data is UTF-8 or similar encoding —
	/// if it’s UTF-16 or UTF-32, for instance, this will always return false.
	/// Luckily these are rare.
	///
	/// The String to search for should be something that could be encoded
	/// in ASCII — like "<opml" or "<rss". (In other words,
	/// the sequence of characters would always be the same in
	/// commonly-used encodings.)
	func containsASCIIString(_ searchFor: String) -> Bool {

		contains(searchFor.utf8)
	}

	/// Return true if searchFor appears in self.
	///
	/// Both `self` and `searchFor` may be slices of a larger `Data`: the bytes are
	/// compared through zero-based unsafe buffers, never through `Data` indices
	/// (a slice keeps the indices of its base, so `searchFor[0]` is not always valid).
	///
	/// - Parameter searchFor: The byte sequence to look for.
	/// - Returns: `false` when `searchFor` is empty or longer than `self`;
	///   otherwise whether `searchFor` occurs anywhere in `self`.
	func contains(_ searchFor: Data) -> Bool {

		let searchForCount = searchFor.count
		let dataCount = count

		guard searchForCount > 0, searchForCount <= dataCount else {
			return false
		}

		return withUnsafeBytes { (haystack: UnsafeRawBufferPointer) -> Bool in
			searchFor.withUnsafeBytes { (needle: UnsafeRawBufferPointer) -> Bool in

				let initialByte = needle[0]
				let lastPossibleStart = dataCount - searchForCount

				// Check the first byte cheaply before comparing the whole candidate window.
				for start in 0...lastPossibleStart where haystack[start] == initialByte {
					if haystack[start..<(start + searchForCount)].elementsEqual(needle) {
						return true
					}
				}

				return false
			}
		}
	}
}
|
||||
@@ -0,0 +1,28 @@
|
||||
//
|
||||
// Dictionary+Parser.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 8/18/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
public extension Dictionary where Key == String, Value == String {

	/// Returns the value for `key`, falling back to a case-insensitive key match.
	///
	/// An exact lookup is tried first. Only when that misses are the entries walked,
	/// comparing each stored key to `key` without regard to case; the first entry
	/// that matches (in the dictionary’s iteration order) supplies the result.
	///
	/// - Parameter key: The key to look up.
	/// - Returns: The matching value, or `nil` if no key matches.
	func object(forCaseInsensitiveKey key: String) -> String? {

		if let exactMatch = self[key] {
			return exactMatch
		}

		let foldedKey = key.lowercased()
		let caseInsensitiveMatch = first(where: { entry in
			foldedKey.caseInsensitiveCompare(entry.key) == .orderedSame
		})

		return caseInsensitiveMatch?.value
	}
}
|
||||
@@ -0,0 +1,23 @@
|
||||
//
|
||||
// String+RSParser.swift
|
||||
// RSParser
|
||||
//
|
||||
// Created by Nate Weaver on 2020-01-19.
|
||||
// Copyright © 2020 Ranchero Software, LLC. All rights reserved.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
public extension String {

	/// `nil` when the string is empty or made up solely of whitespace and newlines;
	/// otherwise the string itself, untrimmed.
	var nilIfEmptyOrWhitespace: String? {
		let trimmed = trimmingCharacters(in: .whitespacesAndNewlines)
		guard !trimmed.isEmpty else {
			return nil
		}
		return self
	}

	/// Returns true when `s` is `nil` or the empty string.
	///
	/// A string containing only whitespace is not considered empty.
	static func isEmptyOrNil(_ s: String?) -> Bool {
		s?.isEmpty ?? true
	}
}
|
||||
349
Modules/Parser/Sources/Parser/SAX/HTMLEntityDecoder.swift
Normal file
349
Modules/Parser/Sources/Parser/SAX/HTMLEntityDecoder.swift
Normal file
@@ -0,0 +1,349 @@
|
||||
//
|
||||
// HTMLEntityDecoder.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 9/14/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
public final class HTMLEntityDecoder {

	/// Returns `encodedString` with the HTML entities it contains replaced by the text they stand for.
	///
	/// Whatever follows an ampersand but can’t be decoded — a bare `&`, an unknown name,
	/// a reference with no terminating `;` — is passed through unchanged, ampersand included.
	///
	/// - Parameter encodedString: Text that may contain HTML entities.
	/// - Returns: The decoded text, or `encodedString` itself when no entity was decoded.
	public static func decodedString(_ encodedString: String) -> String {

		let scanner = EntityScanner(string: encodedString)
		var result = ""
		var didDecodeAtLeastOneEntity = false

		while !scanner.isAtEnd {

			result.append(scanner.scanUpToAmpersand())

			// scanUpToAmpersand consumes the ampersand it stops at, and the text it returns
			// never contains one. So if the last consumed character is not an ampersand,
			// the scan ran off the end of the string and there is nothing left to decode.
			guard scanner.previousCharacter == EntityScanner.ampersandCharacter else {
				break
			}

			let locationAfterAmpersand = scanner.scanLocation

			if let decodedEntity = scanner.scanEntityValue() {
				result.append(decodedEntity)
				didDecodeAtLeastOneEntity = true
			}
			else {
				// Not an entity: keep the ampersand and resume with the very next character,
				// so that nothing following the ampersand is dropped.
				result.append("&")
				scanner.scanLocation = locationAfterAmpersand
			}
		}

		if !didDecodeAtLeastOneEntity { // No entities decoded?
			return encodedString
		}
		return result
	}
}

/// Purpose-built version of NSScanner, which has deprecated the parts we want to use.
///
/// Locations are counted in `Character`s from the start of `string`.
final class EntityScanner {

	let string: String
	let count: Int
	var scanLocation = 0

	/// The characters of `string`, kept in an array so that lookup by integer location
	/// is constant-time rather than a walk from the start of the string.
	private let characters: [Character]

	var isAtEnd: Bool {
		scanLocation >= count
	}

	/// The character at `scanLocation`, or nil when at the end.
	var currentCharacter: Character? {
		guard !isAtEnd else {
			return nil
		}
		return character(at: scanLocation)
	}

	/// The character just before `scanLocation` (the one most recently consumed), or nil at the start.
	var previousCharacter: Character? {
		character(at: scanLocation - 1)
	}

	init(string: String) {
		let characters = Array(string)
		self.string = string
		self.characters = characters
		self.count = characters.count
	}

	static let ampersandCharacter = Character("&")

	/// Scans up to the next ampersand and returns the characters before it.
	///
	/// The ampersand itself is consumed: afterwards `scanLocation` is just past it,
	/// or at the end of the string if no ampersand was found.
	/// - Returns: the scanned portion before the ampersand. May be empty string.
	func scanUpToAmpersand() -> String {

		var scanned = ""

		while let ch = currentCharacter {
			scanLocation += 1
			if ch == Self.ampersandCharacter {
				break
			}
			scanned.append(ch)
		}

		return scanned
	}

	static let semicolonCharacter = Character(";")

	/// Scans an entity body starting at `scanLocation` (just past an ampersand) and decodes it.
	///
	/// On success `scanLocation` is left just past the terminating semicolon.
	/// On failure — whitespace or the end of the string reached before a semicolon,
	/// the length limit exceeded, or an entity that can’t be decoded —
	/// `scanLocation` is restored to where it was on entry.
	/// - Returns: the decoded text, or nil if there is no decodable entity here.
	func scanEntityValue() -> String? {

		let initialScanLocation = scanLocation
		let maxEntityLength = 20 // It’s probably smaller, but this is just for sanity.

		while let ch = currentCharacter {

			if let scalar = ch.unicodeScalars.first, CharacterSet.whitespacesAndNewlines.contains(scalar) {
				break
			}

			if ch == Self.semicolonCharacter {
				let entity = String(characters[initialScanLocation..<scanLocation])
				// Unknown entities are expected in real-world text; this is not a programmer error.
				guard let decoded = decodedEntity(entity) else {
					break
				}
				scanLocation += 1
				return decoded
			}

			scanLocation += 1
			if scanLocation - initialScanLocation > maxEntityLength {
				break
			}
		}

		scanLocation = initialScanLocation
		return nil
	}

	/// Bounds-checked lookup into `characters`.
	private func character(at location: Int) -> Character? {
		guard characters.indices.contains(location) else {
			return nil
		}
		return characters[location]
	}
}
|
||||
|
||||
extension String {

	/// Returns the index `i` characters from the start of the string.
	///
	/// An offset equal to the string’s length yields `endIndex`, which is valid
	/// as a range bound but not as a subscript.
	/// - Returns: nil if `i` is negative or greater than the string’s length.
	func indexForInt(_ i: Int) -> Index? {

		// A negative offset walks backward, where the endIndex limit has no effect.
		guard i >= 0 else {
			return nil
		}
		return index(startIndex, offsetBy: i, limitedBy: endIndex)
	}

	/// Returns the character `i` characters from the start of the string.
	/// - Returns: nil if `i` is out of bounds (including `i == count`).
	func characterAtIntIndex(_ i: Int) -> Character? {

		// indexForInt may return endIndex, which must not be subscripted.
		guard let index = indexForInt(i), index < endIndex else {
			return nil
		}

		return self[index]
	}

	/// Returns the characters in `intRange`, where the bounds are character offsets from the start.
	/// - Returns: nil if either bound is negative or greater than the string’s length.
	func substring(intRange: Range<Int>) -> String? {

		guard let rangeLower = indexForInt(intRange.lowerBound), let rangeUpper = indexForInt(intRange.upperBound) else {
			return nil
		}

		return String(self[rangeLower..<rangeUpper])
	}
}
|
||||
|
||||
/// Decodes a single HTML entity: a name from `entitiesDictionary`,
/// a hex reference (`#x2019`), or a decimal reference (`#8217`).
///
/// rawEntity may or may not have leading `&` and/or trailing `;` characters.
///
/// - Returns: the decoded text, or nil if the entity isn’t recognized, is a numeric
///   reference with the value 0, or has a value that doesn’t fit in 32 bits.
private func decodedEntity(_ rawEntity: String) -> String? {

	var s = rawEntity

	if s.hasPrefix("&") {
		s.removeFirst()
	}
	if s.hasSuffix(";") {
		s.removeLast()
	}

	if let namedEntity = entitiesDictionary[s] {
		return namedEntity
	}

	if s.hasPrefix("#x") || s.hasPrefix("#X") { // Hex
		let scanner = Scanner(string: s)
		scanner.charactersToBeSkipped = CharacterSet(charactersIn: "#xX")
		var hexValue: UInt64 = 0
		// UInt32(exactly:) instead of UInt32(_:), which traps on values above UInt32.max.
		// Zero is rejected, same as for decimal references.
		guard scanner.scanHexInt64(&hexValue), let value = UInt32(exactly: hexValue), value >= 1 else {
			return nil
		}
		return stringWithValue(value)
	}

	if s.hasPrefix("#") { // Decimal
		s.removeFirst()
		guard let value = UInt32(s), value >= 1 else {
			return nil
		}
		return stringWithValue(value)
	}

	return nil
}
|
||||
|
||||
/// Replacement code points for numeric references in the range 0x80–0x9F,
/// indexed by `value - 0x80`.
///
/// From WebCore's HTMLEntityParser
private let windowsLatin1ExtensionArray: [UInt32] = [
	0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
	0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
]

/// Returns a one-character string for the code point `value`.
///
/// Values from 0x80 through 0x9F are first remapped through `windowsLatin1ExtensionArray`.
///
/// - Parameter value: The numeric value of a character reference.
/// - Returns: nil if the (possibly remapped) value is not a valid Unicode scalar —
///   a surrogate, or anything above 0x10FFFF.
private func stringWithValue(_ value: UInt32) -> String? {

	var modifiedValue = value

	if (0x80...0x9F).contains(value) { // value >= 128 && value < 160
		modifiedValue = windowsLatin1ExtensionArray[Int(value - 0x80)]
	}

	// The failable initializer does the validity check for us.
	guard let scalar = Unicode.Scalar(modifiedValue) else {
		return nil
	}

	return String(Character(scalar))
}
|
||||
|
||||
/// Named HTML entities (without the leading `&` and trailing `;`) and the text each one decodes to.
///
/// Names are case-sensitive: `Aacute` and `aacute` are different entities.
private let entitiesDictionary: [String: String] =
[
	"AElig": "Æ",
	"Aacute": "Á",
	"Acirc": "Â",
	"Agrave": "À",
	"Aring": "Å",
	"Atilde": "Ã",
	"Auml": "Ä",
	"Ccedil": "Ç",
	"Dstrok": "Ð",
	"ETH": "Ð",
	"Eacute": "É",
	"Ecirc": "Ê",
	"Egrave": "È",
	"Euml": "Ë",
	"Iacute": "Í",
	"Icirc": "Î",
	"Igrave": "Ì",
	"Iuml": "Ï",
	"Ntilde": "Ñ",
	"Oacute": "Ó",
	"Ocirc": "Ô",
	"Ograve": "Ò",
	"Oslash": "Ø",
	"Otilde": "Õ",
	"Ouml": "Ö",
	"Pi": "Π",
	"THORN": "Þ",
	"Uacute": "Ú",
	"Ucirc": "Û",
	"Ugrave": "Ù",
	"Uuml": "Ü",
	"Yacute": "Ý",
	"aacute": "á",
	"acirc": "â",
	"acute": "´",
	"aelig": "æ",
	"agrave": "à",
	"amp": "&",
	"apos": "'",
	"aring": "å",
	"atilde": "ã",
	"auml": "ä",
	"brkbar": "¦",
	"brvbar": "¦",
	"ccedil": "ç",
	"cedil": "¸",
	"cent": "¢",
	"copy": "©",
	"curren": "¤",
	"deg": "°",
	"die": "¨",
	"divide": "÷",
	"eacute": "é",
	"ecirc": "ê",
	"egrave": "è",
	"eth": "ð",
	"euml": "ë",
	"euro": "€",
	"frac12": "½",
	"frac14": "¼",
	"frac34": "¾",
	"gt": ">",
	"hearts": "♥",
	"hellip": "…",
	"iacute": "í",
	"icirc": "î",
	"iexcl": "¡",
	"igrave": "ì",
	"iquest": "¿",
	"iuml": "ï",
	"laquo": "«",
	"ldquo": "“",
	"lsquo": "‘",
	"lt": "<",
	"macr": "¯",
	"mdash": "—",
	"micro": "µ",
	"middot": "·",
	"ndash": "–",
	"not": "¬",
	"ntilde": "ñ",
	"oacute": "ó",
	"ocirc": "ô",
	"ograve": "ò",
	"ordf": "ª",
	"ordm": "º",
	"oslash": "ø",
	"otilde": "õ",
	"ouml": "ö",
	"para": "¶",
	"pi": "π",
	"plusmn": "±",
	"pound": "£",
	"quot": "\"",
	"raquo": "»",
	"rdquo": "”",
	"reg": "®",
	"rsquo": "’",
	"sect": "§",
	"shy": "\u{00AD}", // soft hyphen (173)
	"sup1": "¹",
	"sup2": "²",
	"sup3": "³",
	"szlig": "ß",
	"thorn": "þ",
	"times": "×",
	"trade": "™",
	"uacute": "ú",
	"ucirc": "û",
	"ugrave": "ù",
	"uml": "¨",
	"uuml": "ü",
	"yacute": "ý",
	"yen": "¥",
	"yuml": "ÿ",
	"infin": "∞",
	"nbsp": "\u{00A0}" // no-break space (160)
]
|
||||
19
Modules/Parser/Sources/Parser/SAX/ParserData.swift
Normal file
19
Modules/Parser/Sources/Parser/SAX/ParserData.swift
Normal file
@@ -0,0 +1,19 @@
|
||||
//
|
||||
// ParserData.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 8/18/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
/// A URL string paired with a block of data.
///
/// NOTE(review): presumably `data` is what was fetched from `url` and is
/// about to be handed to a parser — confirm against callers.
public struct ParserData: Sendable {

	/// The URL, as a string.
	public let url: String

	/// The bytes associated with `url`.
	public let data: Data

	/// Creates a value holding `url` and `data` as given.
	public init(url: String, data: Data) {
		(self.url, self.data) = (url, data)
	}
}
|
||||
200
Modules/Parser/Sources/Parser/SAX/SAXHTMLParser.swift
Normal file
200
Modules/Parser/Sources/Parser/SAX/SAXHTMLParser.swift
Normal file
@@ -0,0 +1,200 @@
|
||||
//
|
||||
// SAXHTMLParser.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 8/26/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
import FoundationExtras
|
||||
import libxml2
|
||||
|
||||
/// Receives the events a `SAXHTMLParser` reports while it parses.
///
/// The pointers passed to these methods come straight from libxml2.
/// NOTE(review): presumably they are only valid for the duration of the call —
/// copy anything that needs to outlive it.
public protocol SAXHTMLParserDelegate: AnyObject {

	/// An element started.
	///
	/// - Parameters:
	///   - startElement: The element’s name, as a C string.
	///   - attributes: The element’s attributes, or nil. `SAXHTMLParser.attributesDictionary(_:)`
	///     turns this into a dictionary.
	func saxHTMLParser(_: SAXHTMLParser, startElement: XMLPointer, attributes: UnsafePointer<XMLPointer?>?)

	/// An element ended. The parser stops storing characters right after this call returns.
	///
	/// - Parameter endElement: The element’s name, as a C string.
	func saxHTMLParser(_: SAXHTMLParser, endElement: XMLPointer)

	/// Character data was found.
	///
	/// - Parameters:
	///   - charactersFound: The start of the character bytes.
	///   - count: The number of bytes. Guaranteed to be greater than 0.
	func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int)
}
|
||||
|
||||
/// Pushes HTML data through libxml2’s SAX-style HTML parser and forwards
/// start-element, end-element, and character events to a delegate.
public final class SAXHTMLParser {

	fileprivate let delegate: SAXHTMLParserDelegate

	private var data: Data
	private var storingCharacters = false
	private var characters = Data()

	/// The characters stored since `beginStoringCharacters()` was called,
	/// or nil when characters are not being stored.
	public var currentCharacters: Data? { // UTF-8 encoded
		storingCharacters ? characters : nil
	}

	// Conveniences to get string version of currentCharacters

	/// `currentCharacters` decoded as UTF-8. Nil when nothing is stored,
	/// the stored data is empty, or it isn’t valid UTF-8.
	public var currentString: String? {

		guard let storedCharacters = currentCharacters, !storedCharacters.isEmpty else {
			return nil
		}
		return String(data: storedCharacters, encoding: .utf8)
	}

	/// `currentString` with leading and trailing whitespace and newlines removed.
	public var currentStringWithTrimmedWhitespace: String? {

		currentString?.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
	}

	/// Creates a parser for `data` that reports to `delegate`. Nothing is parsed until `parse()` is called.
	public init(delegate: SAXHTMLParserDelegate, data: Data) {

		self.delegate = delegate
		self.data = data
	}

	/// Parses the data in a single chunk, calling the delegate as events occur.
	///
	/// Does nothing when the data is empty.
	public func parse() {

		if data.isEmpty {
			return
		}

		data.withUnsafeBytes { rawBuffer in

			guard let bytes = rawBuffer.bindMemory(to: CChar.self).baseAddress else {
				return
			}

			let byteCount = Int32(data.count)
			let options = HTML_PARSE_RECOVER.rawValue | HTML_PARSE_NONET.rawValue | HTML_PARSE_COMPACT.rawValue | HTML_PARSE_NOERROR.rawValue | HTML_PARSE_NOWARNING.rawValue

			let characterEncoding = xmlDetectCharEncoding(bytes, byteCount)
			// The parser itself is passed, unretained, as the user data handed back to the SAX callbacks.
			let context = htmlCreatePushParserCtxt(&saxHandlerStruct, Unmanaged.passUnretained(self).toOpaque(), nil, 0, nil, characterEncoding)
			htmlCtxtUseOptions(context, Int32(options))

			// Everything goes in as one chunk; the second call marks the end of input.
			htmlParseChunk(context, bytes, byteCount, 0)
			htmlParseChunk(context, nil, 0, 1)

			htmlFreeParserCtxt(context)
		}
	}

	/// Delegate can call from xmlStartElement. Characters will be available in xmlEndElement as currentCharacters property. Storing characters is stopped after each xmlEndElement.
	public func beginStoringCharacters() {

		characters.count = 0
		storingCharacters = true
	}

	/// Stops storing characters and discards whatever was stored.
	public func endStoringCharacters() {

		characters.count = 0
		storingCharacters = false
	}

	/// Builds a dictionary from the attributes array passed to the start-element callback.
	///
	/// The array is read as name/value pairs and ends at the first nil name.
	/// A name whose value is nil gets the empty string as its value.
	///
	/// - Returns: nil when `attributes` is nil; otherwise the dictionary (possibly empty).
	public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?) -> StringDictionary? {

		guard let attributes else {
			return nil
		}

		var dictionary = [String: String]()
		var offset = 0

		while let name = attributes[offset] {
			let value = attributes[offset + 1].map { String(cString: $0) } ?? ""
			dictionary[String(cString: name)] = value
			offset += 2
		}

		return dictionary
	}
}
|
||||
|
||||
// Entry points for the libxml2 callbacks in saxHandlerStruct.
private extension SAXHTMLParser {

	/// Appends the bytes to the stored characters (when storing is on), then tells the delegate.
	func charactersFound(_ bytes: XMLPointer, count byteCount: Int) {

		if storingCharacters {
			characters.append(bytes, count: byteCount)
		}
		delegate.saxHTMLParser(self, charactersFound: bytes, count: byteCount)
	}

	/// Tells the delegate that an element started.
	func startElement(_ elementName: XMLPointer, attributes elementAttributes: UnsafePointer<XMLPointer?>?) {

		delegate.saxHTMLParser(self, startElement: elementName, attributes: elementAttributes)
	}

	/// Tells the delegate that an element ended, then stops storing characters.
	/// The delegate is called first so that it can still read the stored characters.
	func endElement(_ elementName: XMLPointer) {

		delegate.saxHTMLParser(self, endElement: elementName)
		endStoringCharacters()
	}
}
|
||||
|
||||
/// Recovers the `SAXHTMLParser` from the opaque user-data pointer libxml2 hands to the SAX callbacks.
/// The reference is taken unretained, matching the `passUnretained` used when the pointer was created in `parse()`.
private func parser(from context: UnsafeMutableRawPointer) -> SAXHTMLParser {

	let unmanagedParser = Unmanaged<SAXHTMLParser>.fromOpaque(context)
	return unmanagedParser.takeUnretainedValue()
}
|
||||
|
||||
/// The SAX handler given to libxml2’s HTML push parser.
///
/// Only the start-element, end-element, and characters callbacks are set. Each one
/// drops the event if a required pointer is missing, and otherwise forwards it
/// to the `SAXHTMLParser` recovered from the context pointer.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {

	var handler = htmlSAXHandler()

	handler.startElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?, attributes: UnsafeMutablePointer<XMLPointer?>?) in

		if let context, let name {
			parser(from: context).startElement(name, attributes: attributes)
		}
	}

	handler.endElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?) in

		if let context, let name {
			parser(from: context).endElement(name)
		}
	}

	handler.characters = { (context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) in

		// Empty runs are never reported.
		if let context, let ch, len > 0 {
			parser(from: context).charactersFound(ch, count: Int(len))
		}
	}

	return handler
}()
|
||||
204
Modules/Parser/Sources/Parser/SAX/SAXParser.swift
Normal file
204
Modules/Parser/Sources/Parser/SAX/SAXParser.swift
Normal file
@@ -0,0 +1,204 @@
|
||||
//
|
||||
//  SAXParser.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 8/12/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
import FoundationExtras
|
||||
import libxml2
|
||||
|
||||
/// A pointer to libxml2 character data (`xmlChar`).
public typealias XMLPointer = UnsafePointer<xmlChar>

/// Receives the events a `SAXParser` reports while it parses.
///
/// The pointers passed to these methods come straight from libxml2.
/// NOTE(review): presumably they are only valid for the duration of the call —
/// copy anything that needs to outlive it.
public protocol SAXParserDelegate {

	/// An element started.
	///
	/// - Parameters:
	///   - xmlStartElement: The element’s name.
	///   - prefix: The element’s namespace prefix, if any.
	///   - uri: The element’s namespace URI, if any.
	///   - namespaceCount: The namespace count reported by libxml2.
	///   - namespaces: The namespaces array reported by libxml2.
	///   - attributeCount: The number of attributes in `attributes`.
	///   - attributesDefaultedCount: The defaulted-attribute count reported by libxml2.
	///   - attributes: The attributes. `SAXParser.attributesDictionary(_:attributeCount:)`
	///     turns this into a dictionary.
	func saxParser(_: SAXParser, xmlStartElement: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?)

	/// An element ended. The parser stops storing characters right after this call returns.
	func saxParser(_: SAXParser, xmlEndElement: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?)

	/// Character data was found.
	///
	/// - Parameters:
	///   - xmlCharactersFound: The start of the character bytes.
	///   - count: The number of bytes. Always greater than 0.
	func saxParser(_: SAXParser, xmlCharactersFound: XMLPointer, count: Int)
}
|
||||
|
||||
/// Pushes XML data through libxml2’s SAX2 parser and forwards
/// start-element, end-element, and character events to a delegate.
public final class SAXParser {

	fileprivate let delegate: SAXParserDelegate

	/// The characters stored since `beginStoringCharacters()` was called,
	/// or nil when characters are not being stored.
	public var currentCharacters: Data? { // UTF-8 encoded

		guard storingCharacters else {
			return nil
		}
		return characters
	}

	// Conveniences to get string version of currentCharacters

	/// `currentCharacters` decoded as UTF-8. Nil when nothing is stored,
	/// the stored data is empty, or it isn’t valid UTF-8.
	public var currentString: String? {

		guard let d = currentCharacters, !d.isEmpty else {
			return nil
		}
		return String(data: d, encoding: .utf8)
	}

	/// `currentString` with leading and trailing whitespace and newlines removed.
	public var currentStringWithTrimmedWhitespace: String? {

		guard let s = currentString else {
			return nil
		}
		return s.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
	}

	private var data: Data
	private var storingCharacters = false
	private var characters = Data()

	/// Creates a parser for `data` that reports to `delegate`. Nothing is parsed until `parse()` is called.
	public init(delegate: SAXParserDelegate, data: Data) {

		self.delegate = delegate
		self.data = data
	}

	/// Parses the data in a single chunk, calling the delegate as events occur.
	///
	/// Does nothing when the data is empty.
	public func parse() {

		guard !data.isEmpty else {
			return
		}

		// The parser itself is passed, unretained, as the user data handed back to the SAX callbacks.
		let context = xmlCreatePushParserCtxt(&saxHandlerStruct, Unmanaged.passUnretained(self).toOpaque(), nil, 0, nil)
		xmlCtxtUseOptions(context, Int32(XML_PARSE_RECOVER.rawValue | XML_PARSE_NOENT.rawValue))

		data.withUnsafeBytes { bufferPointer in
			if let bytes = bufferPointer.bindMemory(to: CChar.self).baseAddress {
				xmlParseChunk(context, bytes, Int32(data.count), 0)
			}
		}

		// Mark the end of input.
		xmlParseChunk(context, nil, 0, 1)
		xmlFreeParserCtxt(context)
	}

	/// Delegate can call from xmlStartElement. Characters will be available in xmlEndElement as currentCharacters property. Storing characters is stopped after each xmlEndElement.
	public func beginStoringCharacters() {

		storingCharacters = true
		characters.count = 0
	}

	/// Stops storing characters and discards whatever was stored.
	public func endStoringCharacters() {

		storingCharacters = false
		characters.count = 0
	}

	/// Builds a dictionary from the attributes array passed to the start-element callback.
	///
	/// Each attribute occupies five consecutive pointers in `attributes`:
	/// local name, prefix, URI, start of value, end of value.
	/// A prefixed attribute is stored under `prefix:name`.
	/// An attribute with a missing name or value, or whose value isn’t valid UTF-8, is skipped.
	///
	/// - Parameters:
	///   - attributes: The attributes array from the start-element callback.
	///   - attributeCount: The number of attributes (not pointers) in `attributes`.
	/// - Returns: nil when `attributes` is nil or `attributeCount` is not positive;
	///   otherwise the dictionary (possibly empty).
	public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?, attributeCount: Int) -> StringDictionary? {

		guard attributeCount > 0, let attributes else {
			return nil
		}

		var dictionary = [String: String]()

		let fieldCount = 5

		// Deriving the offset from the loop index means that skipping an attribute
		// always moves on to the next one.
		for attributeIndex in 0..<attributeCount {

			let base = attributeIndex * fieldCount

			guard let attribute = attributes[base], let valueStart = attributes[base + 3], let valueEnd = attributes[base + 4], valueEnd >= valueStart else {
				continue
			}

			var attributeName = String(cString: attribute)
			if let prefix = attributes[base + 1] {
				let attributePrefix = String(cString: prefix)
				attributeName = "\(attributePrefix):\(attributeName)"
			}

			// The value is not 0-terminated; its length is the distance between the two pointers.
			let valueCount = valueEnd - valueStart
			let value = String(bytes: UnsafeRawBufferPointer(start: valueStart, count: valueCount), encoding: .utf8)

			if let value {
				dictionary[attributeName] = value
			}
		}

		return dictionary
	}
}
|
||||
|
||||
// Entry points for the libxml2 callbacks in saxHandlerStruct.
private extension SAXParser {

	/// Appends the bytes to the stored characters (when storing is on), then tells the delegate.
	func charactersFound(_ bytes: XMLPointer, count byteCount: Int) {

		if storingCharacters {
			characters.append(bytes, count: byteCount)
		}
		delegate.saxParser(self, xmlCharactersFound: bytes, count: byteCount)
	}

	/// Tells the delegate that an element started, passing every argument through unchanged.
	func startElement(_ elementName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {

		delegate.saxParser(
			self,
			xmlStartElement: elementName,
			prefix: prefix,
			uri: uri,
			namespaceCount: namespaceCount,
			namespaces: namespaces,
			attributeCount: attributeCount,
			attributesDefaultedCount: attributesDefaultedCount,
			attributes: attributes
		)
	}

	/// Tells the delegate that an element ended, then stops storing characters.
	/// The delegate is called first so that it can still read the stored characters.
	func endElement(_ elementName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {

		delegate.saxParser(self, xmlEndElement: elementName, prefix: prefix, uri: uri)
		endStoringCharacters()
	}
}
|
||||
|
||||
/// libxml2 start-element callback (installed as `startElementNs`).
/// Drops the event if the context or the element name is missing; otherwise
/// forwards it to the `SAXParser` recovered from `context`, converting the counts to `Int`.
private func startElement(_ context: UnsafeMutableRawPointer?, name: XMLPointer?, prefix: XMLPointer?, URI: XMLPointer?, nb_namespaces: CInt, namespaces: UnsafeMutablePointer<XMLPointer?>?, nb_attributes: CInt, nb_defaulted: CInt, attributes: UnsafeMutablePointer<XMLPointer?>?) {

	if let context, let name {
		parser(from: context).startElement(
			name,
			prefix: prefix,
			uri: URI,
			namespaceCount: Int(nb_namespaces),
			namespaces: namespaces,
			attributeCount: Int(nb_attributes),
			attributesDefaultedCount: Int(nb_defaulted),
			attributes: attributes
		)
	}
}
|
||||
|
||||
/// libxml2 end-element callback (installed as `endElementNs`).
/// Drops the event if the context or the element name is missing; otherwise
/// forwards it to the `SAXParser` recovered from `context`.
private func endElement(_ context: UnsafeMutableRawPointer?, name: XMLPointer?, prefix: XMLPointer?, URI: XMLPointer?) {

	if let context, let name {
		parser(from: context).endElement(name, prefix: prefix, uri: URI)
	}
}
|
||||
|
||||
/// libxml2 characters callback.
/// Drops the event if the context or the character pointer is missing, or if the
/// run is empty; otherwise forwards it to the `SAXParser` recovered from `context`.
private func charactersFound(_ context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) {

	if let context, let ch, len > 0 {
		parser(from: context).charactersFound(ch, count: Int(len))
	}
}
|
||||
|
||||
/// Recovers the `SAXParser` from the opaque user-data pointer libxml2 hands to the SAX callbacks.
/// The reference is taken unretained, matching the `passUnretained` used when the pointer was created in `parse()`.
private func parser(from context: UnsafeMutableRawPointer) -> SAXParser {

	let unmanagedParser = Unmanaged<SAXParser>.fromOpaque(context)
	return unmanagedParser.takeUnretainedValue()
}
|
||||
|
||||
/// The SAX handler given to libxml2’s XML push parser.
///
/// Only the namespace-aware start-element and end-element callbacks and the
/// characters callback are set; `initialized` is set to `XML_SAX2_MAGIC`.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {

	var handler = xmlSAXHandler()

	handler.initialized = XML_SAX2_MAGIC

	handler.startElementNs = startElement
	handler.endElementNs = endElement
	handler.characters = charactersFound

	return handler
}()
|
||||
|
||||
41
Modules/Parser/Sources/Parser/SAX/SAXUtilities.swift
Normal file
41
Modules/Parser/Sources/Parser/SAX/SAXUtilities.swift
Normal file
@@ -0,0 +1,41 @@
|
||||
//
|
||||
// SAXUtilities.swift
|
||||
//
|
||||
//
|
||||
// Created by Brent Simmons on 8/26/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
import libxml2
|
||||
|
||||
/// Returns true if the 0-terminated string at `localName` is byte-for-byte equal to `tag`.
///
/// - Parameters:
///   - localName: A 0-terminated string, such as an element name passed to a SAX callback.
///   - tag: The bytes to compare against, including a trailing 0 terminator.
/// - Returns: true only if the two strings have the same length and the same bytes.
///   An empty `tag` (one with no terminator) never matches.
public func SAXEqualTags(_ localName: XMLPointer, _ tag: ContiguousArray<Int8>) -> Bool {

	return tag.withUnsafeBufferPointer { tagBytes in

		let tagCount = tagBytes.count // includes 0 terminator

		guard tagCount > 0 else {
			return false
		}

		let tagLength = tagCount - 1

		for i in 0..<tagLength {

			let localNameCharacter = localName[i]
			if localNameCharacter == 0 {
				// localName is shorter than tag. Stop here: reading on would run past its end.
				return false
			}

			// bitPattern: a byte of 0x80 or above is negative as Int8, and UInt8(_:) would trap on it.
			let tagCharacter = UInt8(bitPattern: tagBytes[i])
			if localNameCharacter != tagCharacter {
				return false
			}
		}

		// localName might actually be longer — make sure it’s the same length as tag.
		return localName[tagLength] == 0
	}
}
|
||||
|
||||
public extension String {

	/// Creates a string from UTF-8 bytes at `xmlPointer`.
	///
	/// - Parameters:
	///   - xmlPointer: The start of the bytes.
	///   - count: The number of bytes to read. When nil, the bytes are read up to
	///     (not including) the first 0 byte, as measured by `strlen`.
	/// - Returns: nil if the bytes aren’t valid UTF-8.
	init?(xmlPointer: XMLPointer, count: Int? = nil) {
		let byteCount = count ?? strlen(xmlPointer)
		let utf8Bytes = Data(bytes: xmlPointer, count: byteCount)
		self.init(data: utf8Bytes, encoding: .utf8)
	}
}
|
||||
Reference in New Issue
Block a user