Move modules to Modules folder.

This commit is contained in:
Brent Simmons
2025-01-06 21:13:56 -08:00
parent 430871c94a
commit 2933d9aca0
463 changed files with 2 additions and 20 deletions

View File

@@ -0,0 +1,595 @@
//
// DateParser.swift
//
//
// Created by Brent Simmons on 8/28/24.
//
import Foundation
public final class DateParser {
// MARK: - Public API
/// Parses the W3C and pubDate date formats used in feeds.
///
/// This is a fast alternative to the system APIs for parsing dates.
///
/// - Parameter data: The raw bytes of the date string.
/// - Returns: The parsed date, or `nil` when `data` holds fewer than 6 or
///   more than 150 bytes, or when the bytes can't be parsed as a date.
public static func date(data: Data) -> Date? {
	let byteCount = data.count

	// Anything outside this range can't reasonably be a date string.
	guard (6...150).contains(byteCount) else {
		return nil
	}

	return data.withUnsafeBytes { rawBuffer -> Date? in
		let dateBytes = rawBuffer.bindMemory(to: UInt8.self)

		// pubDate parsing is used only when the bytes don't look like a W3C date
		// but do look like a pubDate. Everything else goes through the W3C parser,
		// which doubles as the fallback in case detection fails.
		let shouldParseAsPubDate = !dateIsW3CDate(dateBytes, byteCount) && dateIsPubDate(dateBytes, byteCount)
		return shouldParseAsPubDate ? parsePubDate(dateBytes, byteCount) : parseW3CDate(dateBytes, byteCount)
	}
}
/// Parses a date string by encoding it as UTF-8 and handing the bytes to `date(data:)`.
///
/// - Returns: The parsed date, or `nil` when the string can't be encoded or parsed.
public static func date(string: String) -> Date? {
	string.data(using: .utf8).flatMap { encodedString in
		date(data: encodedString)
	}
}
// The buffer of raw date-string bytes that the parsing helpers read from.
private typealias DateBuffer = UnsafeBufferPointer<UInt8>
// Maps time zone abbreviations to their offsets from GMT, in seconds.
// Each value is built with `timeZoneOffset(hours, minutes)`.
// Lookups are plain dictionary lookups, so the abbreviation must match a key exactly
// (including case). Where an abbreviation is used for more than one zone in the
// wild, the table holds a single offset for it.
// See http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations for list
private static let timeZoneTable: [String: Int] = [
"GMT": timeZoneOffset(0, 0),
"UTC": timeZoneOffset(0, 0),
"PDT": timeZoneOffset(-7, 0),
"PST": timeZoneOffset(-8, 0),
"EST": timeZoneOffset(-5, 0),
"EDT": timeZoneOffset(-4, 0),
"MDT": timeZoneOffset(-6, 0),
"MST": timeZoneOffset(-7, 0),
"CST": timeZoneOffset(-6, 0),
"CDT": timeZoneOffset(-5, 0),
"ACT": timeZoneOffset(-8, 0),
"AFT": timeZoneOffset(4, 30),
"AMT": timeZoneOffset(4, 0),
"ART": timeZoneOffset(-3, 0),
"AST": timeZoneOffset(3, 0),
"AZT": timeZoneOffset(4, 0),
"BIT": timeZoneOffset(-12, 0),
"BDT": timeZoneOffset(8, 0),
"ACST": timeZoneOffset(9, 30),
"AEST": timeZoneOffset(10, 0),
"AKST": timeZoneOffset(-9, 0),
"AMST": timeZoneOffset(5, 0),
"AWST": timeZoneOffset(8, 0),
"AZOST": timeZoneOffset(-1, 0),
"BIOT": timeZoneOffset(6, 0),
"BRT": timeZoneOffset(-3, 0),
"BST": timeZoneOffset(6, 0),
"BTT": timeZoneOffset(6, 0),
"CAT": timeZoneOffset(2, 0),
"CCT": timeZoneOffset(6, 30),
"CET": timeZoneOffset(1, 0),
"CEST": timeZoneOffset(2, 0),
"CHAST": timeZoneOffset(12, 45),
"ChST": timeZoneOffset(10, 0),
"CIST": timeZoneOffset(-8, 0),
"CKT": timeZoneOffset(-10, 0),
"CLT": timeZoneOffset(-4, 0),
"CLST": timeZoneOffset(-3, 0),
"COT": timeZoneOffset(-5, 0),
"COST": timeZoneOffset(-4, 0),
"CVT": timeZoneOffset(-1, 0),
"CXT": timeZoneOffset(7, 0),
"EAST": timeZoneOffset(-6, 0),
"EAT": timeZoneOffset(3, 0),
"ECT": timeZoneOffset(-4, 0),
"EEST": timeZoneOffset(3, 0),
"EET": timeZoneOffset(2, 0),
"FJT": timeZoneOffset(12, 0),
"FKST": timeZoneOffset(-4, 0),
"GALT": timeZoneOffset(-6, 0),
"GET": timeZoneOffset(4, 0),
"GFT": timeZoneOffset(-3, 0),
"GILT": timeZoneOffset(7, 0),
"GIT": timeZoneOffset(-9, 0),
"GST": timeZoneOffset(-2, 0),
"GYT": timeZoneOffset(-4, 0),
"HAST": timeZoneOffset(-10, 0),
"HKT": timeZoneOffset(8, 0),
"HMT": timeZoneOffset(5, 0),
"IRKT": timeZoneOffset(8, 0),
"IRST": timeZoneOffset(3, 30),
"IST": timeZoneOffset(2, 0),
"JST": timeZoneOffset(9, 0),
"KRAT": timeZoneOffset(7, 0),
"KST": timeZoneOffset(9, 0),
"LHST": timeZoneOffset(10, 30),
"LINT": timeZoneOffset(14, 0),
"MAGT": timeZoneOffset(11, 0),
"MIT": timeZoneOffset(-9, 30),
"MSK": timeZoneOffset(3, 0),
"MUT": timeZoneOffset(4, 0),
"NDT": timeZoneOffset(-2, 30),
"NFT": timeZoneOffset(11, 30),
"NPT": timeZoneOffset(5, 45),
"NT": timeZoneOffset(-3, 30),
"OMST": timeZoneOffset(6, 0),
"PETT": timeZoneOffset(12, 0),
"PHOT": timeZoneOffset(13, 0),
"PKT": timeZoneOffset(5, 0),
"RET": timeZoneOffset(4, 0),
"SAMT": timeZoneOffset(4, 0),
"SAST": timeZoneOffset(2, 0),
"SBT": timeZoneOffset(11, 0),
"SCT": timeZoneOffset(4, 0),
"SLT": timeZoneOffset(5, 30),
"SST": timeZoneOffset(8, 0),
"TAHT": timeZoneOffset(-10, 0),
"THA": timeZoneOffset(7, 0),
"UYT": timeZoneOffset(-3, 0),
"UYST": timeZoneOffset(-2, 0),
"VET": timeZoneOffset(-4, 30),
"VLAT": timeZoneOffset(10, 0),
"WAT": timeZoneOffset(1, 0),
"WET": timeZoneOffset(0, 0),
"WEST": timeZoneOffset(1, 0),
"YAKT": timeZoneOffset(9, 0),
"YEKT": timeZoneOffset(5, 0)
]
}
// MARK: - Private
private extension DateParser {
/// ASCII byte values of the characters the date parser compares against.
struct DateCharacter {

	// Whitespace
	static let space = UInt8(ascii: " ")
	static let `return` = UInt8(ascii: "\r")
	static let newline = UInt8(ascii: "\n")
	static let tab = UInt8(ascii: "\t")

	// Punctuation (`hyphen` and `minus` are the same byte)
	static let hyphen = UInt8(ascii: "-")
	static let comma = UInt8(ascii: ",")
	static let dot = UInt8(ascii: ".")
	static let colon = UInt8(ascii: ":")
	static let plus = UInt8(ascii: "+")
	static let minus = UInt8(ascii: "-")

	// Letters, in upper and lower case
	static let A = UInt8(ascii: "A")
	static let a = UInt8(ascii: "a")
	static let D = UInt8(ascii: "D")
	static let d = UInt8(ascii: "d")
	static let F = UInt8(ascii: "F")
	static let f = UInt8(ascii: "f")
	static let J = UInt8(ascii: "J")
	static let j = UInt8(ascii: "j")
	static let M = UInt8(ascii: "M")
	static let m = UInt8(ascii: "m")
	static let N = UInt8(ascii: "N")
	static let n = UInt8(ascii: "n")
	static let O = UInt8(ascii: "O")
	static let o = UInt8(ascii: "o")
	static let S = UInt8(ascii: "S")
	static let s = UInt8(ascii: "s")
	static let U = UInt8(ascii: "U")
	static let u = UInt8(ascii: "u")
	static let Y = UInt8(ascii: "Y")
	static let y = UInt8(ascii: "y")
	static let Z = UInt8(ascii: "Z")
	static let z = UInt8(ascii: "z")
}
/// The months of the year. Raw values run from 1 (January) through 12 (December).
enum Month: Int {
	case January = 1
	case February
	case March
	case April
	case May
	case June
	case July
	case August
	case September
	case October
	case November
	case December
}
// MARK: - Standard Formats
/// Reports whether the bytes look like a W3C date, something like `2010-11-17T08:40:07-05:00`.
///
/// Only the beginning is checked: after any leading whitespace there must be
/// four digits in a row followed by a hyphen (the year, as in `2010-`).
/// The `T` in the middle might be missing, so it isn't looked for.
private static func dateIsW3CDate(_ bytes: DateBuffer, _ numberOfBytes: Int) -> Bool {

	func isWhitespace(_ byte: UInt8) -> Bool {
		byte == DateCharacter.space || byte == DateCharacter.`return` || byte == DateCharacter.newline || byte == DateCharacter.tab
	}

	// Only positions that leave room for `yyyy-` can be the start of the year.
	let candidateIndexes = 0..<numberOfBytes - 4
	guard let yearIndex = candidateIndexes.first(where: { !isWhitespace(bytes[$0]) }) else {
		return false
	}

	assert(yearIndex + 4 < numberOfBytes)

	// The first non-whitespace character must be the beginning of the year.
	let startsWithFourDigits = (yearIndex..<yearIndex + 4).allSatisfy { isDigit(bytes[$0]) }
	return startsWithFourDigits && bytes[yearIndex + 4] == DateCharacter.hyphen
}
/// Reports whether the bytes might be a pubDate-style date.
///
/// The check is deliberately loose: any space or comma anywhere in the buffer counts.
private static func dateIsPubDate(_ bytes: DateBuffer, _ numberOfBytes: Int) -> Bool {
	bytes.contains { byte in
		byte == DateCharacter.space || byte == DateCharacter.comma
	}
}
/// Parses a W3C-style date such as `2010-11-17T08:40:07-05:00`.
///
/// Handles formats like:
/// - `yyyy-MM-dd'T'HH:mm:ss`
/// - `yyyy-MM-dd'T'HH:mm:sszzz`
/// - `yyyy-MM-dd'T'HH:mm:ss.SSSzzz`
///
/// Year, month, and day are required. Hour, minute, and second default to 0 when missing.
/// Fractional seconds may have any number of digits; they are kept to millisecond precision.
///
/// - Returns: The parsed date, or `nil` when the year, month, or day can't be read.
private static func parseW3CDate(_ bytes: DateBuffer, _ numberOfBytes: Int) -> Date? {

	var finalIndex = 0

	guard let year = nextNumericValue(bytes, numberOfBytes, 0, 4, &finalIndex) else {
		return nil
	}
	guard let month = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) else {
		return nil
	}
	guard let day = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) else {
		return nil
	}

	// NOTE(review): for minute-precision input such as `1997-07-16T19:20+01:00`,
	// the seconds lookup skips ahead and reads the offset's digits — confirm
	// whether that format needs to be supported.
	let hour = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) ?? 0
	let minute = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) ?? 0
	let second = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) ?? 0

	var currentIndex = finalIndex + 1

	// Fractional seconds. The digits are read one at a time so that each is weighted
	// by its position: `.5` is 500 ms and `.25` is 250 ms. Reading them here also
	// leaves currentIndex exactly on the first byte after the fraction, so the sign
	// of a following time zone offset is never skipped.
	var milliseconds = 0
	if currentIndex < numberOfBytes && bytes[currentIndex] == DateCharacter.dot {
		currentIndex += 1 // Move past the dot.

		var digitWeight = 100
		while currentIndex < numberOfBytes && isDigit(bytes[currentIndex]) {
			// Beyond 3 digits of precision the weight is 0, so extra digits are ignored.
			milliseconds += (Int(bytes[currentIndex]) - 48) * digitWeight // '0' is 48
			digitWeight /= 10
			currentIndex += 1
		}
	}

	// Skip any stray digits that directly precede the time zone.
	while currentIndex < numberOfBytes && isDigit(bytes[currentIndex]) {
		currentIndex += 1
	}

	let timeZoneOffset = parsedTimeZoneOffset(bytes, numberOfBytes, currentIndex)

	return dateWithYearMonthDayHourMinuteSecondAndtimeZoneOffset(year, month, day, hour, minute, second, milliseconds, timeZoneOffset)
}
/// Parses a pubDate-style date such as `Mon, 06 Jan 2025 21:13:56 -0800`.
///
/// The year is required. A missing day defaults to 1, a missing month to January,
/// and missing hours, minutes, and seconds to 0. Seconds are read only when a colon
/// directly follows the minutes; the time zone is read only when a space directly
/// follows the time.
///
/// - Returns: The parsed date, or `nil` when no year can be read.
private static func parsePubDate(_ bytes: DateBuffer, _ numberOfBytes: Int) -> Date? {

	var finalIndex = 0

	let day = nextNumericValue(bytes, numberOfBytes, 0, 2, &finalIndex) ?? 1
	let month = nextMonthValue(bytes, numberOfBytes, finalIndex + 1, &finalIndex) ?? .January

	guard let year = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 4, &finalIndex) else {
		return nil
	}

	let hour = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) ?? 0
	let minute = nextNumericValue(bytes, numberOfBytes, finalIndex + 1, 2, &finalIndex) ?? 0

	// Optional seconds
	var second = 0
	let secondsIndex = finalIndex + 1
	if secondsIndex < numberOfBytes, bytes[secondsIndex] == DateCharacter.colon {
		second = nextNumericValue(bytes, numberOfBytes, secondsIndex, 2, &finalIndex) ?? 0
	}

	// Optional time zone
	var offset = 0
	let timeZoneIndex = finalIndex + 1
	if timeZoneIndex < numberOfBytes, bytes[timeZoneIndex] == DateCharacter.space {
		offset = parsedTimeZoneOffset(bytes, numberOfBytes, timeZoneIndex)
	}

	return dateWithYearMonthDayHourMinuteSecondAndtimeZoneOffset(year, month.rawValue, day, hour, minute, second, 0, offset)
}
// MARK: - Date Creation
/// Creates a date from Gregorian calendar components and a time zone offset.
///
/// - Parameters:
///   - year: The full year (for example 2010).
///   - month: The month, 1-based (1 is January).
///   - day: The day of the month.
///   - hour: The hour.
///   - minute: The minute.
///   - second: The second.
///   - milliseconds: Milliseconds to add to the result.
///   - timeZoneOffset: The offset from GMT, in seconds, of the zone the components are expressed in.
/// - Returns: The date, or `nil` if the calendar fallback can't create one.
static func dateWithYearMonthDayHourMinuteSecondAndtimeZoneOffset(_ year: Int, _ month: Int, _ day: Int, _ hour: Int, _ minute: Int, _ second: Int, _ milliseconds: Int, _ timeZoneOffset: Int) -> Date? {

	var timeInfo = tm()
	timeInfo.tm_sec = CInt(second)
	timeInfo.tm_min = CInt(minute)
	timeInfo.tm_hour = CInt(hour)
	timeInfo.tm_mday = CInt(day)
	timeInfo.tm_mon = CInt(month - 1) // It's 1-based coming in
	timeInfo.tm_year = CInt(year - 1900) // see time.h -- it's years since 1900
	timeInfo.tm_wday = -1
	timeInfo.tm_yday = -1
	timeInfo.tm_isdst = -1
	timeInfo.tm_gmtoff = 0
	timeInfo.tm_zone = nil

	// timegm reports failure by returning -1. The check has to be made on its raw
	// result, before the time zone offset is applied.
	let utcTime = timegm(&timeInfo)

	if utcTime == -1 {
		// Calendar is super-amazingly slow (which is partly why this parser exists),
		// so this is used only when timegm fails — for instance when the date is far
		// enough in the future (19 January 2038 03:14:08Z on 32-bit systems).
		// (-1 is also the legitimate value for 1969-12-31 23:59:59Z; the calendar
		// produces the right date for that case too.)
		var dateComponents = DateComponents()
		dateComponents.timeZone = TimeZone(secondsFromGMT: timeZoneOffset)
		dateComponents.year = year
		dateComponents.month = month
		dateComponents.day = day
		dateComponents.hour = hour
		dateComponents.minute = minute
		dateComponents.second = second
		dateComponents.nanosecond = milliseconds * 1000000

		// The components are Gregorian, so they must not be interpreted using
		// whatever calendar the user has chosen.
		return Calendar(identifier: .gregorian).date(from: dateComponents)
	}

	var timeInterval = TimeInterval(utcTime - timeZoneOffset)
	if milliseconds > 0 {
		timeInterval += TimeInterval(milliseconds) / 1000.0
	}

	return Date(timeIntervalSince1970: timeInterval)
}
// MARK: - Time Zones and Offsets
/// Reads a time zone from the bytes at and after `startingIndex` and returns its offset from GMT in seconds.
///
/// Up to five letters, digits, and plus/minus signs are collected; every other byte
/// (colons and spaces included) is skipped. The result is 0 when nothing is collected,
/// when the first collected character is `Z` or `z`, or when an abbreviation isn't in
/// the time zone table. Anything containing a letter is looked up as an abbreviation;
/// otherwise the characters are treated as a numeric offset.
private static func parsedTimeZoneOffset(_ bytes: DateBuffer, _ numberOfBytes: Int, _ startingIndex: Int) -> Int {

	let maximumCharacters = 5
	var collected = [UInt8]()
	var sawLetter = false

	for index in startingIndex..<numberOfBytes {
		let byte = bytes[index]
		let isLetter = isAlpha(byte)
		let isPartOfOffset = isDigit(byte) || byte == DateCharacter.plus || byte == DateCharacter.minus

		guard isLetter || isPartOfOffset else {
			continue
		}

		sawLetter = sawLetter || isLetter
		collected.append(byte)
		if collected.count >= maximumCharacters {
			break
		}
	}

	guard let firstCharacter = collected.first, firstCharacter != DateCharacter.Z, firstCharacter != DateCharacter.z else {
		return 0
	}

	// The helpers expect six bytes with a nil-terminated last character.
	let timeZoneCharacters = collected + [UInt8](repeating: 0, count: 6 - collected.count)

	if sawLetter {
		return offsetInSecondsForTimeZoneAbbreviation(timeZoneCharacters) ?? 0
	}
	return offsetInSecondsForOffsetCharacters(timeZoneCharacters)
}
/// Converts nil-terminated numeric offset characters, such as `+0530` or `-0800`, to seconds.
///
/// The first two digits are the hours and the next two the minutes. The offset is
/// positive only when the first character is a plus sign; anything else makes it negative.
private static func offsetInSecondsForOffsetCharacters(_ timeZoneCharacters: [UInt8]) -> Int {

	let characterCount = strlen(timeZoneCharacters)
	let sign = (timeZoneCharacters[0] == DateCharacter.plus) ? 1 : -1
	var lastIndexRead = 0

	return timeZoneCharacters.withUnsafeBufferPointer { buffer -> Int in
		let hours = nextNumericValue(buffer, characterCount, 0, 2, &lastIndexRead) ?? 0
		let minutes = nextNumericValue(buffer, characterCount, lastIndexRead + 1, 2, &lastIndexRead) ?? 0
		return sign * ((hours * 60 * 60) + (minutes * 60))
	}
}
/// Returns the offset in seconds for the given hours and minutes.
///
/// When `hours` is negative the minutes move the offset further from zero,
/// so `timeZoneOffset(-9, 30)` is minus nine and a half hours.
static func timeZoneOffset(_ hours: Int, _ minutes: Int) -> Int {
	let secondsFromHours = hours * 60 * 60
	let secondsFromMinutes = minutes * 60
	return hours < 0 ? secondsFromHours - secondsFromMinutes : secondsFromHours + secondsFromMinutes
}
/// Looks up the offset, in seconds, for a time zone abbreviation.
///
/// - Parameter abbreviation: The abbreviation's bytes; anything from the first zero byte on is ignored.
/// - Returns: The offset from the time zone table, or `nil` when the abbreviation isn't in it.
private static func offsetInSecondsForTimeZoneAbbreviation(_ abbreviation: [UInt8]) -> Int? {
	let nameBytes = abbreviation.prefix { $0 != 0 }
	let name = String(decoding: nameBytes, as: UTF8.self)
	return timeZoneTable[name]
}
// MARK: - Parser
/// Finds the next run of letters at or after `startingIndex` and returns the month it names.
///
/// Lots of short-circuits here. Not strict: only as many letters are examined as are
/// needed to tell the months apart, and unrecognized names come back as January.
/// `finalIndex` is set to the index of the last byte examined.
///
/// - Returns: The month, or `nil` when fewer than two letters are found
///   (unless the first letter alone identifies the month).
private static func nextMonthValue(_ bytes: DateBuffer, _ numberOfBytes: Int, _ startingIndex: Int, _ finalIndex: inout Int) -> DateParser.Month? {

	var letterCount = 0
	var letters: [CChar] = [0, 0, 0]

	scan: for index in startingIndex..<numberOfBytes {
		finalIndex = index
		let byte = bytes[index]

		guard isAlpha(byte) else {
			if letterCount == 0 {
				continue scan // Haven't reached the month name yet.
			}
			break scan // The month name just ended.
		}

		letterCount += 1

		if letterCount == 1 {
			// These months are identified by their first letter alone.
			switch byte {
			case DateCharacter.F, DateCharacter.f:
				return .February
			case DateCharacter.S, DateCharacter.s:
				return .September
			case DateCharacter.O, DateCharacter.o:
				return .October
			case DateCharacter.N, DateCharacter.n:
				return .November
			case DateCharacter.D, DateCharacter.d:
				return .December
			default:
				break
			}
		}

		letters[letterCount - 1] = CChar(byte)
		if letterCount >= 3 {
			break scan
		}
	}

	guard letterCount >= 2 else {
		return nil
	}

	// Case-insensitive check of one collected letter.
	func letter(at position: Int, is upper: UInt8, or lower: UInt8) -> Bool {
		letters[position] == upper || letters[position] == lower
	}

	if letter(at: 0, is: DateCharacter.J, or: DateCharacter.j) { // Jan, Jun, Jul
		if letter(at: 1, is: DateCharacter.A, or: DateCharacter.a) {
			return .January
		}
		if letter(at: 1, is: DateCharacter.U, or: DateCharacter.u) {
			return letter(at: 2, is: DateCharacter.N, or: DateCharacter.n) ? Month.June : Month.July
		}
		return .January
	}

	if letter(at: 0, is: DateCharacter.M, or: DateCharacter.m) { // March, May
		return letter(at: 2, is: DateCharacter.Y, or: DateCharacter.y) ? Month.May : Month.March
	}

	if letter(at: 0, is: DateCharacter.A, or: DateCharacter.a) { // April, August
		return letter(at: 1, is: DateCharacter.U, or: DateCharacter.u) ? Month.August : Month.April
	}

	return .January // Should never get here (but possibly do)
}
/// Reads the next run of digits at or after `startingIndex` and returns its value.
///
/// Bytes before the first digit are skipped. Reading stops at the first non-digit
/// after the run starts, or once `maximumNumberOfDigits` digits have been read.
/// `finalIndex` is set to the index of the last byte examined.
///
/// - Returns: The value, or `nil` when no digit is found.
private static func nextNumericValue(_ bytes: DateBuffer, _ numberOfBytes: Int, _ startingIndex: Int, _ maximumNumberOfDigits: Int, _ finalIndex: inout Int) -> Int? {

	// Maximum for the maximum is 4 (for time zone offsets and years)
	assert(maximumNumberOfDigits > 0 && maximumNumberOfDigits <= 4)

	var value = 0
	var digitCount = 0

	for index in startingIndex..<numberOfBytes {
		finalIndex = index
		let byte = Int(bytes[index])

		guard isDigit(byte) else {
			if digitCount == 0 {
				continue // Still looking for the first digit.
			}
			break // The run of digits has ended.
		}

		value = (value * 10) + (byte - 48) // '0' is 48
		digitCount += 1
		if digitCount >= maximumNumberOfDigits {
			break
		}
	}

	return digitCount > 0 ? value : nil
}
/// Reports whether `ch` is a decimal digit, according to C's `isdigit`.
static func isDigit<T: BinaryInteger>(_ ch: T) -> Bool {
	let characterCode = Int32(ch)
	return isdigit(characterCode) != 0
}
/// Reports whether `ch` is a letter, according to C's `isalpha`.
static func isAlpha<T: BinaryInteger>(_ ch: T) -> Bool {
	let characterCode = Int32(ch)
	return isalpha(characterCode) != 0
}
}

View File

@@ -0,0 +1,73 @@
//
// FeedParser.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
// FeedParser handles RSS, Atom, JSON Feed, and RSS-in-JSON.
// You don't need to know the type of feed.
public struct FeedParser {

	/// Reports whether `data` is detected as one of the feed types this parser handles.
	public static func canParse(_ data: Data) -> Bool {
		switch FeedType.feedType(data) {
		case .jsonFeed, .rssInJSON, .rss, .atom:
			return true
		case .unknown, .notAFeed:
			return false
		}
	}

	/// Detects the feed type of `data` and parses it with the matching parser.
	///
	/// - Returns: The parsed feed, or `nil` when the type is unknown or the data isn't a feed.
	/// - Throws: Whatever the JSON Feed and RSS-in-JSON parsers throw.
	public static func parse(urlString: String, data: Data) throws -> ParsedFeed? {
		switch FeedType.feedType(data) {

		case .jsonFeed:
			return try JSONFeedParser.parse(urlString: urlString, data: data)

		case .rssInJSON:
			return try RSSInJSONParser.parse(urlString: urlString, data: data)

		case .rss:
			let rssFeed = RSSParser.parsedFeed(urlString: urlString, data: data)
			return RSSFeedTransformer.parsedFeed(with: rssFeed, feedType: .rss)

		case .atom:
			let atomFeed = AtomParser.parsedFeed(urlString: urlString, data: data)
			return RSSFeedTransformer.parsedFeed(with: atomFeed, feedType: .atom)

		case .unknown, .notAFeed:
			return nil
		}
	}

	/// Parses in a task and then calls `completion` on the main actor
	/// with either the parsed feed or the error that was thrown.
	public static func parse(_ parserData: ParserData, _ completion: @Sendable @escaping (ParsedFeed?, Error?) -> Void) {
		Task {
			let parsedFeed: ParsedFeed?

			do {
				parsedFeed = try await parseAsync(urlString: parserData.url, data: parserData.data)
			} catch {
				Task { @MainActor in
					completion(nil, error)
				}
				return
			}

			Task { @MainActor in
				completion(parsedFeed, nil)
			}
		}
	}

	/// Async wrapper around `parse(urlString:data:)`.
	public static func parseAsync(urlString: String, data: Data) async throws -> ParsedFeed? {
		return try parse(urlString: urlString, data: data)
	}
}

View File

@@ -0,0 +1,29 @@
//
// FeedParserError.swift
// Parser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// The error thrown by the feed parsers. It wraps the specific reason parsing failed.
public struct FeedParserError: Error, Sendable {

	/// The reasons a feed can fail to parse.
	public enum FeedParserErrorType: Sendable {
		case rssChannelNotFound, rssItemsNotFound
		case jsonFeedVersionNotFound, jsonFeedItemsNotFound, jsonFeedTitleNotFound
		case invalidJSON
	}

	/// The reason parsing failed.
	public let errorType: FeedParserErrorType

	public init(_ errorType: FeedParserErrorType) {
		self.errorType = errorType
	}
}

View File

@@ -0,0 +1,148 @@
//
// FeedType.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// The kinds of feed that can be detected, plus `unknown` (not enough information yet)
/// and `notAFeed` (definitely not a feed).
public enum FeedType: Sendable {

	case rss
	case atom
	case jsonFeed
	case rssInJSON
	case unknown
	case notAFeed

	private static let minNumberOfBytesRequired = 128

	/// Detects the type of feed in `data`.
	///
	/// Can be called with partial data — while still downloading, for instance.
	/// If there's not enough data, this returns `.unknown`: ask again when there's more data.
	/// If it's definitely not a feed, this returns `.notAFeed`.
	static func feedType(_ data: Data, isPartialData: Bool = false) -> FeedType {

		let byteCount = data.count
		guard byteCount >= minNumberOfBytesRequired else {
			return .unknown
		}

		return data.withUnsafeBytes { (rawBuffer: UnsafeRawBufferPointer) -> FeedType in

			guard let baseAddress = rawBuffer.baseAddress else {
				return .unknown
			}
			let characters = baseAddress.assumingMemoryBound(to: CChar.self)

			if isProbablyJSON(characters, byteCount) {

				// Might not be able to detect a JSON Feed without all data.
				// Dr. Drang's JSON Feed (see althis.json and allthis-partial.json in tests)
				// has, at this writing, the JSON version element at the end of the feed,
				// which is totally legal — but it means not being able to detect
				// that it's a JSON Feed without all the data.
				// So this returns .unknown instead of .notAFeed.
				guard !isPartialData else {
					return .unknown
				}

				if isProbablyJSONFeed(characters, byteCount) {
					return .jsonFeed
				}
				if isProbablyRSSInJSON(characters, byteCount) {
					return .rssInJSON
				}
			}

			if isProbablyRSS(characters, byteCount) {
				return .rss
			}

			return isProbablyAtom(characters, byteCount) ? .atom : .notAFeed
		}
	}
}
private extension FeedType {

	/// RSS is indicated by an `<rss` or `<rdf:RDF` tag, or by the combination of `<channel>` and `<pubDate>`.
	static func isProbablyRSS(_ bytes: UnsafePointer<CChar>, _ count: Int) -> Bool {
		let hasRootTag = didFindString("<rss", bytes, count) || didFindString("<rdf:RDF", bytes, count)
		guard !hasRootTag else {
			return true
		}

		return didFindString("<channel>", bytes, count) && didFindString("<pubDate>", bytes, count)
	}

	/// Atom is indicated by a `<feed` tag.
	static func isProbablyAtom(_ bytes: UnsafePointer<CChar>, _ count: Int) -> Bool {
		return didFindString("<feed", bytes, count)
	}

	/// JSON is indicated by an opening brace at the start, ignoring whitespace.
	static func isProbablyJSON(_ bytes: UnsafePointer<CChar>, _ count: Int) -> Bool {
		return bytesStartWithStringIgnoringWhitespace("{", bytes, count)
	}

	/// Looks for the JSON Feed version URL, with or without escaped slashes.
	static func isProbablyJSONFeed(_ bytes: UnsafePointer<CChar>, _ count: Int) -> Bool {
		// Assumes already called `isProbablyJSON` and it returned true.
		let hasVersionURL = didFindString("://jsonfeed.org/version/", bytes, count)
		return hasVersionURL || didFindString(":\\/\\/jsonfeed.org\\/version\\/", bytes, count)
	}

	/// Looks for `rss`, `channel`, and `item` — all three must appear.
	static func isProbablyRSSInJSON(_ bytes: UnsafePointer<CChar>, _ count: Int) -> Bool {
		// Assumes already called `isProbablyJSON` and it returned true.
		let requiredStrings = ["rss", "channel", "item"]
		return requiredStrings.allSatisfy { didFindString($0, bytes, count) }
	}

	/// Reports whether `string` appears within the first `numberOfBytes` of `bytes`.
	static func didFindString(_ string: UnsafePointer<CChar>, _ bytes: UnsafePointer<CChar>, _ numberOfBytes: Int) -> Bool {
		return strnstr(bytes, string, numberOfBytes) != nil
	}

	/// ASCII values of the whitespace characters that may precede the content.
	struct Whitespace {
		static let space = UInt8(ascii: " ")
		static let `return` = UInt8(ascii: "\r")
		static let newline = UInt8(ascii: "\n")
		static let tab = UInt8(ascii: "\t")
	}

	/// Reports whether `bytes` begins with `string`, skipping leading whitespace
	/// and up to four bytes of byte order mark.
	static func bytesStartWithStringIgnoringWhitespace(_ string: UnsafePointer<CChar>, _ bytes: UnsafePointer<CChar>, _ numberOfBytes: Int) -> Bool {

		for index in 0..<numberOfBytes {

			let byte = bytes[index]

			let isWhitespace = byte == Whitespace.space || byte == Whitespace.return || byte == Whitespace.newline || byte == Whitespace.tab
			if isWhitespace {
				continue
			}

			// The string has to start right here: its first occurrence must be at this position.
			if byte == string[0], let match = strnstr(bytes, string, numberOfBytes) {
				return match == bytes + index
			}

			// Allow for a BOM of up to four bytes (assuming BOM is only at the start)
			if index >= 4 {
				break
			}
		}

		return false
	}
}

View File

@@ -0,0 +1,247 @@
//
// JSONFeedParser.swift
// Parser
//
// Created by Brent Simmons on 6/25/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
// See https://jsonfeed.org/version/1.1

/// Parses JSON Feed data into a `ParsedFeed`.
public struct JSONFeedParser {

	/// The keys used in JSON Feed documents.
	struct Key {
		// Feed
		static let version = "version"
		static let items = "items"
		static let title = "title"
		static let homePageURL = "home_page_url"
		static let feedURL = "feed_url"
		static let feedDescription = "description"
		static let nextURL = "next_url"
		static let icon = "icon"
		static let favicon = "favicon"
		static let expired = "expired"

		// Authors
		static let author = "author"
		static let authors = "authors"
		static let name = "name"
		static let url = "url"
		static let avatar = "avatar"

		// Hubs
		static let hubs = "hubs"
		static let type = "type"

		// Items
		static let contentHTML = "content_html"
		static let contentText = "content_text"
		static let externalURL = "external_url"
		static let summary = "summary"
		static let image = "image"
		static let bannerImage = "banner_image"
		static let datePublished = "date_published"
		static let dateModified = "date_modified"
		static let tags = "tags"
		static let uniqueID = "id"

		// Attachments
		static let attachments = "attachments"
		static let mimeType = "mime_type"
		static let sizeInBytes = "size_in_bytes"
		static let durationInSeconds = "duration_in_seconds"

		static let language = "language"
	}

	static let jsonFeedVersionMarker = "://jsonfeed.org/version/" // Allow for the mistake of not getting the scheme exactly correct.

	/// Parses JSON Feed data.
	///
	/// - Parameters:
	///   - urlString: The URL of the feed. Used as the feed URL when the feed doesn't specify one.
	///   - data: The JSON data.
	/// - Throws: `FeedParserError` when the data isn't a JSON dictionary, or when the
	///   version marker, the items array, or the title is missing.
	public static func parse(urlString: String, data: Data) throws -> ParsedFeed? {

		guard let feedDictionary = JSONUtilities.dictionary(with: data) else {
			throw FeedParserError(.invalidJSON)
		}

		guard let version = feedDictionary[Key.version] as? String, version.range(of: jsonFeedVersionMarker) != nil else {
			throw FeedParserError(.jsonFeedVersionNotFound)
		}

		guard let itemsArray = feedDictionary[Key.items] as? JSONArray else {
			throw FeedParserError(.jsonFeedItemsNotFound)
		}

		guard let title = feedDictionary[Key.title] as? String else {
			throw FeedParserError(.jsonFeedTitleNotFound)
		}

		return ParsedFeed(
			type: .jsonFeed,
			title: title,
			homePageURL: feedDictionary[Key.homePageURL] as? String,
			feedURL: feedDictionary[Key.feedURL] as? String ?? urlString,
			language: feedDictionary[Key.language] as? String,
			feedDescription: feedDictionary[Key.feedDescription] as? String,
			nextURL: feedDictionary[Key.nextURL] as? String,
			iconURL: feedDictionary[Key.icon] as? String,
			faviconURL: feedDictionary[Key.favicon] as? String,
			authors: parseAuthors(feedDictionary),
			expired: feedDictionary[Key.expired] as? Bool ?? false,
			hubs: parseHubs(feedDictionary),
			items: parseItems(itemsArray, urlString))
	}
}
private extension JSONFeedParser {

	/// Parses the authors of a feed or an item.
	///
	/// The `authors` array is used when present (even if no author in it can be parsed,
	/// in which case the set is empty). Otherwise the single `author` object is used.
	static func parseAuthors(_ dictionary: JSONDictionary) -> Set<ParsedAuthor>? {

		if let authorsArray = dictionary[Key.authors] as? JSONArray {
			return Set(authorsArray.compactMap { parseAuthor($0) })
		}

		guard let authorDictionary = dictionary[Key.author] as? JSONDictionary else {
			return nil
		}
		guard let singleAuthor = parseAuthor(authorDictionary) else {
			return nil
		}

		var authors = Set<ParsedAuthor>()
		authors.insert(singleAuthor)
		return authors
	}

	/// Parses one author object. Returns `nil` when it has no name, URL, or avatar.
	static func parseAuthor(_ dictionary: JSONDictionary) -> ParsedAuthor? {

		let name = dictionary[Key.name] as? String
		let url = dictionary[Key.url] as? String
		let avatar = dictionary[Key.avatar] as? String

		let hasAnyValue = name != nil || url != nil || avatar != nil
		guard hasAnyValue else {
			return nil
		}

		return ParsedAuthor(name: name, url: url, avatarURL: avatar, emailAddress: nil)
	}

	/// Parses the feed's hubs. Hubs missing a URL or a type are skipped.
	/// Returns `nil` when there are no usable hubs.
	static func parseHubs(_ dictionary: JSONDictionary) -> Set<ParsedHub>? {

		guard let hubsArray = dictionary[Key.hubs] as? JSONArray else {
			return nil
		}

		var hubs = Set<ParsedHub>()
		for hubDictionary in hubsArray {
			guard let hubURL = hubDictionary[Key.url] as? String, let hubType = hubDictionary[Key.type] as? String else {
				continue
			}
			hubs.insert(ParsedHub(type: hubType, url: hubURL))
		}

		return hubs.isEmpty ? nil : hubs
	}

	/// Parses the feed's items, leaving out any that can't be parsed.
	static func parseItems(_ itemsArray: JSONArray, _ feedURL: String) -> Set<ParsedItem> {

		var items = Set<ParsedItem>()
		for itemDictionary in itemsArray {
			if let item = parseItem(itemDictionary, feedURL) {
				items.insert(item)
			}
		}
		return items
	}

	/// Parses one item. An item must have a unique ID and at least one of
	/// HTML content and text content — otherwise this returns `nil`.
	static func parseItem(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? {

		guard let uniqueID = parseUniqueID(itemDictionary) else {
			return nil
		}

		let contentHTML = itemDictionary[Key.contentHTML] as? String
		let contentText = itemDictionary[Key.contentText] as? String
		guard contentHTML != nil || contentText != nil else {
			return nil
		}

		let tags = (itemDictionary[Key.tags] as? [String]).map { Set($0) }

		return ParsedItem(
			syncServiceID: nil,
			uniqueID: uniqueID,
			feedURL: feedURL,
			url: itemDictionary[Key.url] as? String,
			externalURL: itemDictionary[Key.externalURL] as? String,
			title: parseTitle(itemDictionary, feedURL),
			language: itemDictionary[Key.language] as? String,
			contentHTML: contentHTML,
			contentText: contentText,
			summary: itemDictionary[Key.summary] as? String,
			imageURL: itemDictionary[Key.image] as? String,
			bannerImageURL: itemDictionary[Key.bannerImage] as? String,
			datePublished: parseDate(itemDictionary[Key.datePublished] as? String),
			dateModified: parseDate(itemDictionary[Key.dateModified] as? String),
			authors: parseAuthors(itemDictionary),
			tags: tags,
			attachments: parseAttachments(itemDictionary))
	}

	/// Returns the item's title, with HTML entities decoded for the feeds known to need it.
	static func parseTitle(_ itemDictionary: JSONDictionary, _ feedURL: String) -> String? {

		guard let title = itemDictionary[Key.title] as? String else {
			return nil
		}

		return isSpecialCaseTitleWithEntitiesFeed(feedURL) ? HTMLEntityDecoder.decodedString(title) : title
	}

	/// Reports whether the feed is one of those known to put HTML entities in its titles.
	static func isSpecialCaseTitleWithEntitiesFeed(_ feedURL: String) -> Bool {

		// As of 16 Feb. 2018, Kottke's and Heer's feeds include HTML entities in the title elements.
		// If we find more feeds like this, we'll add them here. If these feeds get fixed, we'll remove them.

		let feedsWithEntities = ["kottke.org", "pxlnv.com", "macstories.net", "macobserver.com"]
		let lowercasedFeedURL = feedURL.lowercased()

		return feedsWithEntities.contains { lowercasedFeedURL.contains($0) }
	}

	/// Returns the item's ID as a string, or `nil` when it's missing or of some other type.
	static func parseUniqueID(_ itemDictionary: JSONDictionary) -> String? {

		let rawID = itemDictionary[Key.uniqueID]

		if let stringID = rawID as? String {
			return stringID // Spec says it must be a string
		}

		// Version 1 spec also says that if it's a number, even though that's incorrect, it should be coerced to a string.
		if let integerID = rawID as? Int {
			return "\(integerID)"
		}
		if let doubleID = rawID as? Double {
			return "\(doubleID)"
		}

		return nil
	}

	/// Parses a date string. Returns `nil` for a missing, empty, or unparseable string.
	static func parseDate(_ dateString: String?) -> Date? {

		if let dateString = dateString, !dateString.isEmpty {
			return DateParser.date(string: dateString)
		}
		return nil
	}

	/// Parses the item's attachments, leaving out any that can't be parsed.
	/// Returns `nil` when the item has no attachments array.
	static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set<ParsedAttachment>? {

		guard let attachmentsArray = itemDictionary[Key.attachments] as? JSONArray else {
			return nil
		}

		var attachments = Set<ParsedAttachment>()
		for attachmentObject in attachmentsArray {
			if let attachment = parseAttachment(attachmentObject) {
				attachments.insert(attachment)
			}
		}
		return attachments
	}

	/// Parses one attachment. The URL and the MIME type are required.
	static func parseAttachment(_ attachmentObject: JSONDictionary) -> ParsedAttachment? {

		guard let url = attachmentObject[Key.url] as? String, let mimeType = attachmentObject[Key.mimeType] as? String else {
			return nil
		}

		return ParsedAttachment(
			url: url,
			mimeType: mimeType,
			title: attachmentObject[Key.title] as? String,
			sizeInBytes: attachmentObject[Key.sizeInBytes] as? Int,
			durationInSeconds: attachmentObject[Key.durationInSeconds] as? Int)
	}
}

View File

@@ -0,0 +1,182 @@
//
// RSSInJSONParser.swift
// Parser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
import RSCore
// See https://github.com/scripting/Scripting-News/blob/master/rss-in-json/README.md
// Also: http://cyber.harvard.edu/rss/rss.html

/// Parses RSS-in-JSON data into a `ParsedFeed`.
public struct RSSInJSONParser {

	/// Parses RSS-in-JSON data.
	///
	/// - Parameters:
	///   - urlString: The URL of the feed. Used as the feed URL of the result.
	///   - data: The JSON data.
	/// - Throws: The error from `JSONSerialization` when the data can't be deserialized, or a
	///   `FeedParserError` when the top level isn't a dictionary or the channel or items can't be found.
	public static func parse(urlString: String, data: Data) throws -> ParsedFeed? {

		guard let parsedObject = try JSONSerialization.jsonObject(with: data) as? JSONDictionary else {
			throw FeedParserError(.invalidJSON)
		}

		guard let rssObject = parsedObject["rss"] as? JSONDictionary, let channelObject = rssObject["channel"] as? JSONDictionary else {
			throw FeedParserError(.rssChannelNotFound)
		}

		// I'd bet money that in practice the items array won't always appear correctly inside the channel object.
		// I'd also bet that sometimes it gets called "items" instead of "item".
		// So look in each plausible place, in order of preference.
		let foundItems = (channelObject["item"] as? JSONArray)
			?? (parsedObject["item"] as? JSONArray)
			?? (channelObject["items"] as? JSONArray)
			?? (parsedObject["items"] as? JSONArray)

		guard let itemsObject = foundItems else {
			throw FeedParserError(.rssItemsNotFound)
		}

		return ParsedFeed(
			type: .rssInJSON,
			title: channelObject["title"] as? String,
			homePageURL: channelObject["link"] as? String,
			feedURL: urlString,
			language: channelObject["language"] as? String,
			feedDescription: channelObject["description"] as? String,
			nextURL: nil,
			iconURL: nil,
			faviconURL: nil,
			authors: nil,
			expired: false,
			hubs: nil,
			items: parseItems(itemsObject, urlString))
	}
}
private extension RSSInJSONParser {
/// Converts every item dictionary into a `ParsedItem`, dropping the ones that cannot be parsed.
static func parseItems(_ itemsObject: JSONArray, _ feedURL: String) -> Set<ParsedItem> {
    let parsedItems = itemsObject.compactMap { parsedItemWithDictionary($0, feedURL) }
    return Set(parsedItems)
}
/// Builds a `ParsedItem` from one JSON item dictionary.
///
/// - Returns: `nil` when the item has no description and no title, or when no unique ID can be produced.
static func parsedItemWithDictionary(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? {
    let externalURL = itemDictionary["link"] as? String
    let title = itemDictionary["title"] as? String

    // A description that contains no "<" is treated as plain text rather than HTML.
    let description = itemDictionary["description"] as? String
    let descriptionIsHTML = description?.contains("<") ?? false
    let contentHTML = descriptionIsHTML ? description : nil
    let contentText = descriptionIsHTML ? nil : description

    guard contentHTML != nil || contentText != nil || title != nil else {
        return nil
    }

    let datePublished = (itemDictionary["pubDate"] as? String).flatMap { DateParser.date(string: $0) }
    let authors = parseAuthors(itemDictionary)
    let tags = parseTags(itemDictionary)
    let attachments = parseAttachments(itemDictionary)

    var uniqueID = itemDictionary["guid"] as? String
    if uniqueID == nil {
        // No guid: concatenate whichever identifying pieces exist and hash the result.
        // The calculated ID is only meaningful within this one feed.
        let pieces: [String?] = [
            datePublished.map { "\($0.timeIntervalSince1970)" },
            title,
            externalURL,
            authors?.first?.emailAddress,
            attachments?.first?.url
        ]
        var seed = pieces.compactMap { $0 }.joined()
        if seed.isEmpty {
            // Nothing else to go on: fall back to the content itself.
            seed = contentText ?? contentHTML ?? seed
        }
        uniqueID = seed.md5String
    }

    guard let resolvedID = uniqueID else {
        return nil
    }
    return ParsedItem(syncServiceID: nil, uniqueID: resolvedID, feedURL: feedURL, url: nil, externalURL: externalURL, title: title, language: nil, contentHTML: contentHTML, contentText: contentText, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: nil, authors: authors, tags: tags, attachments: attachments)
}
/// Reads the item's `author` string and wraps it as a single author whose email address is that string.
///
/// - Returns: `nil` when there is no string value under `author`.
static func parseAuthors(_ itemDictionary: JSONDictionary) -> Set<ParsedAuthor>? {
    if let email = itemDictionary["author"] as? String {
        let author = ParsedAuthor(name: nil, url: nil, avatarURL: nil, emailAddress: email)
        return Set([author])
    }
    return nil
}
/// Collects tags from the item's `category` value, which may be a single object or an array of objects.
/// The tag text is read from each object's `#value` key.
///
/// - Returns: `nil` when there is no category, or when a single category object has no `#value` string.
static func parseTags(_ itemDictionary: JSONDictionary) -> Set<String>? {
    let category = itemDictionary["category"]

    if let single = category as? JSONDictionary {
        guard let tag = single["#value"] as? String else {
            return nil
        }
        return Set([tag])
    }

    if let many = category as? JSONArray {
        let tags = many.compactMap { $0["#value"] as? String }
        return Set(tags)
    }

    return nil
}
/// Builds a one-element attachment set from the item's `enclosure` object.
///
/// - Returns: `nil` when there is no enclosure object, no `url` string, or the attachment cannot be created.
static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set<ParsedAttachment>? {
    guard let enclosure = itemDictionary["enclosure"] as? JSONDictionary,
          let urlString = enclosure["url"] as? String else {
        return nil
    }

    // `length` may arrive as a JSON number or as a string of digits.
    let sizeInBytes: Int?
    if let size = enclosure["length"] as? Int {
        sizeInBytes = size
    } else if let sizeString = enclosure["length"] as? String {
        sizeInBytes = (sizeString as NSString).integerValue
    } else {
        sizeInBytes = nil
    }

    let mimeType = enclosure["type"] as? String
    guard let attachment = ParsedAttachment(url: urlString, mimeType: mimeType, title: nil, sizeInBytes: sizeInBytes, durationInSeconds: nil) else {
        return nil
    }
    return Set([attachment])
}
}

View File

@@ -0,0 +1,42 @@
//
// ParsedAttachment.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// An immutable attachment (enclosure) belonging to a parsed item.
public final class ParsedAttachment: Hashable, Sendable {

    public let url: String
    public let mimeType: String?
    public let title: String?
    public let sizeInBytes: Int?
    public let durationInSeconds: Int?

    /// Creates an attachment; fails when `url` is empty.
    public init?(url: String, mimeType: String?, title: String?, sizeInBytes: Int?, durationInSeconds: Int?) {
        guard !url.isEmpty else {
            return nil
        }

        self.url = url
        self.mimeType = mimeType
        self.title = title
        self.sizeInBytes = sizeInBytes
        self.durationInSeconds = durationInSeconds
    }

    // MARK: - Hashable

    /// Only the URL feeds the hash; equality compares every property.
    public func hash(into hasher: inout Hasher) {
        hasher.combine(url)
    }

    // MARK: - Equatable

    public static func ==(lhs: ParsedAttachment, rhs: ParsedAttachment) -> Bool {
        guard lhs.url == rhs.url, lhs.mimeType == rhs.mimeType, lhs.title == rhs.title else {
            return false
        }
        return lhs.sizeInBytes == rhs.sizeInBytes && lhs.durationInSeconds == rhs.durationInSeconds
    }
}

View File

@@ -0,0 +1,63 @@
//
// ParsedAuthor.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// An immutable author of a parsed feed or item.
public final class ParsedAuthor: Hashable, Codable, Sendable {

    public let name: String?
    public let url: String?
    public let avatarURL: String?
    public let emailAddress: String?

    public init(name: String?, url: String?, avatarURL: String?, emailAddress: String?) {
        self.name = name
        self.url = url
        self.avatarURL = avatarURL
        self.emailAddress = emailAddress
    }

    /// Use when the actual property is unknown. Guesses from the string's contents:
    /// anything containing "@" is an email address, anything starting with "http"
    /// (case-insensitively) is a URL, and everything else is a name.
    convenience init(singleString: String) {
        let looksLikeEmail = singleString.contains("@")
        let looksLikeURL = !looksLikeEmail && singleString.lowercased().hasPrefix("http")
        let isPlainName = !looksLikeEmail && !looksLikeURL

        self.init(
            name: isPlainName ? singleString : nil,
            url: looksLikeURL ? singleString : nil,
            avatarURL: nil,
            emailAddress: looksLikeEmail ? singleString : nil)
    }

    // MARK: - Hashable

    /// Hashes the first non-nil of name, url, emailAddress, avatarURL, or "" when all are nil.
    public func hash(into hasher: inout Hasher) {
        hasher.combine(name ?? url ?? emailAddress ?? avatarURL ?? "")
    }

    // MARK: - Equatable

    public static func ==(lhs: ParsedAuthor, rhs: ParsedAuthor) -> Bool {
        guard lhs.name == rhs.name, lhs.url == rhs.url else {
            return false
        }
        return lhs.avatarURL == rhs.avatarURL && lhs.emailAddress == rhs.emailAddress
    }
}

View File

@@ -0,0 +1,42 @@
//
// ParsedFeed.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// Immutable result of parsing a feed: feed-level metadata plus the set of parsed items.
public final class ParsedFeed: Sendable {

    public let type: FeedType
    public let title: String?
    public let homePageURL: String?
    public let feedURL: String?
    public let language: String?
    public let feedDescription: String?
    public let nextURL: String?
    public let iconURL: String?
    public let faviconURL: String?
    public let authors: Set<ParsedAuthor>?
    public let expired: Bool
    public let hubs: Set<ParsedHub>?
    public let items: Set<ParsedItem>

    /// Stores every argument as given, except `homePageURL`, which is passed through
    /// `nilIfEmptyOrWhitespace` first (helper defined elsewhere; presumably it maps
    /// empty or whitespace-only strings to nil).
    public init(
        type: FeedType,
        title: String?,
        homePageURL: String?,
        feedURL: String?,
        language: String?,
        feedDescription: String?,
        nextURL: String?,
        iconURL: String?,
        faviconURL: String?,
        authors: Set<ParsedAuthor>?,
        expired: Bool,
        hubs: Set<ParsedHub>?,
        items: Set<ParsedItem>
    ) {
        // Identity
        self.type = type
        self.feedURL = feedURL
        self.homePageURL = homePageURL?.nilIfEmptyOrWhitespace

        // Descriptive metadata
        self.title = title
        self.language = language
        self.feedDescription = feedDescription
        self.authors = authors

        // Related URLs
        self.nextURL = nextURL
        self.iconURL = iconURL
        self.faviconURL = faviconURL

        // State and content
        self.expired = expired
        self.hubs = hubs
        self.items = items
    }
}

View File

@@ -0,0 +1,33 @@
//
// ParsedHub.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// An immutable hub reference, identified by its type and URL.
public final class ParsedHub: Hashable, Sendable {

    public let type: String
    public let url: String

    init(type: String, url: String) {
        self.type = type
        self.url = url
    }

    // MARK: - Hashable

    /// Feeds `type` and then `url` into the hasher.
    public func hash(into hasher: inout Hasher) {
        hasher.combine(type)
        hasher.combine(url)
    }

    // MARK: - Equatable

    public static func ==(lhs: ParsedHub, rhs: ParsedHub) -> Bool {
        guard lhs.type == rhs.type else {
            return false
        }
        return lhs.url == rhs.url
    }
}

View File

@@ -0,0 +1,72 @@
//
// ParsedItem.swift
// Parser
//
// Created by Brent Simmons on 6/20/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// An immutable parsed feed item (article).
public final class ParsedItem: Hashable, Sendable {

    public let syncServiceID: String? // Nil when not syncing
    public let uniqueID: String // RSS guid, for instance; may be calculated
    public let feedURL: String
    public let url: String?
    public let externalURL: String?
    public let title: String?
    public let language: String?
    public let contentHTML: String?
    public let contentText: String?
    public let summary: String?
    public let imageURL: String?
    public let bannerImageURL: String?
    public let datePublished: Date?
    public let dateModified: Date?
    public let authors: Set<ParsedAuthor>?
    public let tags: Set<String>?
    public let attachments: Set<ParsedAttachment>?

    /// Stores every argument unchanged.
    public init(
        syncServiceID: String?,
        uniqueID: String,
        feedURL: String,
        url: String?,
        externalURL: String?,
        title: String?,
        language: String?,
        contentHTML: String?,
        contentText: String?,
        summary: String?,
        imageURL: String?,
        bannerImageURL: String?,
        datePublished: Date?,
        dateModified: Date?,
        authors: Set<ParsedAuthor>?,
        tags: Set<String>?,
        attachments: Set<ParsedAttachment>?
    ) {
        // Identity
        self.syncServiceID = syncServiceID
        self.uniqueID = uniqueID
        self.feedURL = feedURL

        // Links
        self.url = url
        self.externalURL = externalURL
        self.imageURL = imageURL
        self.bannerImageURL = bannerImageURL

        // Content
        self.title = title
        self.language = language
        self.contentHTML = contentHTML
        self.contentText = contentText
        self.summary = summary

        // Dates
        self.datePublished = datePublished
        self.dateModified = dateModified

        // Related objects
        self.authors = authors
        self.tags = tags
        self.attachments = attachments
    }

    // MARK: - Hashable

    /// Hashes the sync-service ID when there is one; otherwise hashes the unique ID and then the feed URL.
    public func hash(into hasher: inout Hasher) {
        guard let syncServiceID else {
            hasher.combine(uniqueID)
            hasher.combine(feedURL)
            return
        }
        hasher.combine(syncServiceID)
    }

    // MARK: - Equatable

    /// Two items are equal only when every stored property matches.
    public static func ==(lhs: ParsedItem, rhs: ParsedItem) -> Bool {
        lhs.syncServiceID == rhs.syncServiceID
            && lhs.uniqueID == rhs.uniqueID
            && lhs.feedURL == rhs.feedURL
            && lhs.url == rhs.url
            && lhs.externalURL == rhs.externalURL
            && lhs.title == rhs.title
            && lhs.language == rhs.language
            && lhs.contentHTML == rhs.contentHTML
            && lhs.contentText == rhs.contentText
            && lhs.summary == rhs.summary
            && lhs.imageURL == rhs.imageURL
            && lhs.bannerImageURL == rhs.bannerImageURL
            && lhs.datePublished == rhs.datePublished
            && lhs.dateModified == rhs.dateModified
            && lhs.authors == rhs.authors
            && lhs.tags == rhs.tags
            && lhs.attachments == rhs.attachments
    }
}

View File

@@ -0,0 +1,454 @@
//
// AtomParser.swift
// Parser
//
// Created by Brent Simmons on 6/25/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
import RSCore
/// SAX-driven parser that fills an `RSSFeed` from Atom XML data.
final class AtomParser {

    private var feedURL: String
    private let data: Data
    private let feed: RSSFeed

    /// Articles in the order they were added while parsing.
    private var articles = [RSSArticle]()

    /// The most recently added article, if any.
    private var currentArticle: RSSArticle? {
        return articles.last
    }

    /// Attribute dictionaries pushed on element start and popped on element end.
    private var attributesStack = [StringDictionary]()

    /// Attributes on top of the stack, if any.
    private var currentAttributes: StringDictionary? {
        return attributesStack.last
    }

    // XHTML content is rebuilt as a string while `parsingXHTML` is true.
    private var parsingXHTML = false
    private var xhtmlString: String?

    // Parser state: which kind of element we are currently inside.
    private var currentAuthor: RSSAuthor?
    private var parsingAuthor = false
    private var parsingArticle = false
    private var parsingSource = false
    private var endFeedFound = false

    /// Parses `data` and returns the resulting feed.
    static func parsedFeed(urlString: String, data: Data) -> RSSFeed {
        let atomParser = AtomParser(urlString: urlString, data: data)
        atomParser.parse()
        return atomParser.feed
    }

    init(urlString: String, data: Data) {
        self.data = data
        feedURL = urlString
        feed = RSSFeed(urlString: urlString)
    }
}
private extension AtomParser {
/// Runs the SAX parser over `data` with this object as delegate, then hands the collected articles to the feed.
func parse() {
    SAXParser(delegate: self, data: data).parse()
    feed.articles = articles
}
/// Element names, as C strings, for comparison via `SAXEqualTags`.
private enum XMLName {

    // Structure
    static let feed = "feed".utf8CString
    static let entry = "entry".utf8CString
    static let source = "source".utf8CString

    // Entry content
    static let id = "id".utf8CString
    static let title = "title".utf8CString
    static let link = "link".utf8CString
    static let content = "content".utf8CString
    static let summary = "summary".utf8CString

    // Author
    static let author = "author".utf8CString
    static let name = "name".utf8CString
    static let email = "email".utf8CString
    static let uri = "uri".utf8CString

    // Dates (`issued` and `modified` are the Atom 0.3 names)
    static let published = "published".utf8CString
    static let updated = "updated".utf8CString
    static let issued = "issued".utf8CString
    static let modified = "modified".utf8CString
}
/// Attribute names and attribute values, as Swift strings, used when reading attribute dictionaries.
private enum XMLString {

    // Attribute names
    static let rel = "rel"
    static let href = "href"
    static let title = "title"
    static let type = "type"
    static let length = "length"
    static let xmlLang = "xml:lang"

    // Values of the `rel` attribute
    static let alternate = "alternate"
    static let related = "related"
    static let enclosure = "enclosure"
}
/// The parser's current string with whitespace trimmed.
func currentString(_ saxParser: SAXParser) -> String? {
    return saxParser.currentStringWithTrimmedWhitespace
}
/// Parses the parser's current characters as a date.
///
/// - Returns: `nil` when there are no current characters (also trips an assertion) or `DateParser` returns nil.
func currentDate(_ saxParser: SAXParser) -> Date? {
    if let characters = saxParser.currentCharacters {
        return DateParser.date(data: characters)
    }

    assertionFailure("Unexpected nil saxParser.currentCharacters in AtomParser.currentDate")
    return nil
}
/// Sets the feed title from the current string, unless a title is already set or the string is nil or empty.
func addFeedTitle(_ saxParser: SAXParser) {
    if feed.title != nil {
        return
    }
    guard let candidate = currentString(saxParser), !candidate.isEmpty else {
        return
    }
    feed.title = candidate
}
/// Sets the feed's home page link from the current `link` element's `href`, once.
///
/// The link is accepted when `rel="alternate"`, or when `href` is the element's only attribute.
func addFeedLink() {
    guard feed.link == nil,
          let attributes = currentAttributes,
          let href = attributes[XMLString.href] else {
        return
    }

    let isAlternate = attributes[XMLString.rel] == XMLString.alternate
    // Example: <link href="https://www.allenpike.com/"/> with no rel or anything else.
    let hasOnlyHref = attributes.count == 1

    if isAlternate || hasOnlyHref {
        feed.link = href
    }
}
/// Sets the feed language from the current element's `xml:lang` attribute, unless a language is already set.
func addFeedLanguage() {
    if feed.language != nil {
        return
    }
    guard let attributes = currentAttributes else {
        return
    }
    feed.language = attributes[XMLString.xmlLang]
}
/// Appends a new, empty article for this feed; it becomes `currentArticle`.
func addArticle() {
    articles.append(RSSArticle(feedURL))
}
/// Applies one just-ended, un-prefixed element to the current article.
///
/// Elements with a namespace prefix are ignored.
func addArticleElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) {
    if prefix != nil {
        return
    }
    guard let article = currentArticle else {
        assertionFailure("currentArticle must not be nil in AtomParser.addArticleElement")
        return
    }

    if SAXEqualTags(localName, XMLName.id) {
        article.guid = currentString(saxParser)
        return
    }
    if SAXEqualTags(localName, XMLName.title) {
        article.title = currentString(saxParser)
        return
    }
    if SAXEqualTags(localName, XMLName.content) {
        addContent(saxParser, article)
        return
    }
    if SAXEqualTags(localName, XMLName.summary) {
        addSummary(saxParser, article)
        return
    }
    if SAXEqualTags(localName, XMLName.link) {
        addLink(article)
        return
    }
    if SAXEqualTags(localName, XMLName.published) {
        article.datePublished = currentDate(saxParser)
        return
    }
    if SAXEqualTags(localName, XMLName.updated) {
        article.dateModified = currentDate(saxParser)
        return
    }

    // Atom 0.3 dates: only used when the corresponding date has not been set yet.
    if SAXEqualTags(localName, XMLName.issued) {
        if article.datePublished == nil {
            article.datePublished = currentDate(saxParser)
        }
        return
    }
    if SAXEqualTags(localName, XMLName.modified), article.dateModified == nil {
        article.dateModified = currentDate(saxParser)
    }
}
/// Sets the article body to the current string, replacing any existing body.
func addContent(_ saxParser: SAXParser, _ article: RSSArticle) {
    let content = currentString(saxParser)
    article.body = content
}
/// Uses the current string as the article body, but only when no body has been set yet.
func addSummary(_ saxParser: SAXParser, _ article: RSSArticle) {
    if article.body == nil {
        article.body = currentString(saxParser)
    }
}
/// Applies the current `link` element to `article`, according to its `rel` attribute.
///
/// A missing or empty `rel` counts as "alternate". "related" sets `link` and "alternate"
/// sets `permalink` (each only if not already set); "enclosure" adds an enclosure.
/// Links with a missing or empty `href`, or with any other `rel`, are ignored.
func addLink(_ article: RSSArticle) {
    guard let attributes = currentAttributes,
          let urlString = attributes[XMLString.href], !urlString.isEmpty else {
        return
    }

    var relation = XMLString.alternate
    if let explicitRelation = attributes[XMLString.rel], !explicitRelation.isEmpty {
        relation = explicitRelation
    }

    switch relation {
    case XMLString.related:
        if article.link == nil {
            article.link = urlString
        }
    case XMLString.alternate:
        if article.permalink == nil {
            article.permalink = urlString
        }
    case XMLString.enclosure:
        if let newEnclosure = self.enclosure(urlString, attributes) {
            article.addEnclosure(newEnclosure)
        }
    default:
        break
    }
}
/// Builds an enclosure for `urlString`, taking title, MIME type, and length from the link's attributes.
/// `length` stays nil when the attribute is absent or is not an integer.
func enclosure(_ urlString: String, _ attributes: StringDictionary) -> RSSEnclosure? {
    let result = RSSEnclosure(url: urlString)
    result.title = attributes[XMLString.title]
    result.mimeType = attributes[XMLString.type]
    result.length = attributes[XMLString.length].flatMap { Int($0) }
    return result
}
/// Appends the opening tag for `localName`, including the current attributes, to `xhtmlString`.
///
/// Double quotes inside attribute values are written as `&quot;`.
/// Does nothing (and trips an assertion) when `xhtmlString` is nil or the name cannot be converted.
func addXHTMLTag(_ localName: XMLPointer) {
    guard xhtmlString != nil else {
        assertionFailure("xhtmlString must not be nil when in addXHTMLTag.")
        return
    }
    guard let name = String(xmlPointer: localName) else {
        assertionFailure("Unexpected failure converting XMLPointer to String in addXHTMLTag.")
        return
    }

    var tag = "<"
    tag.append(name)

    if let currentAttributes, !currentAttributes.isEmpty {
        for (key, value) in currentAttributes {
            let encodedValue = value.replacingOccurrences(of: "\"", with: "&quot;")
            tag.append(" ")
            tag.append(key)
            tag.append("=\"")
            tag.append(encodedValue)
            tag.append("\"")
        }
    }

    tag.append(">")

    // Append to the stored property. Binding it with `guard var` would only mutate
    // a local copy (String is a value type) and the tag would be lost.
    xhtmlString?.append(tag)
}
}
extension AtomParser: SAXParserDelegate {
/// Handles an element start: pushes its attributes, updates parser state, and,
/// for elements whose text is needed, starts storing characters.
public func saxParser(_ saxParser: SAXParser, xmlStartElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {

    guard !endFeedFound else {
        return
    }

    let elementAttributes = saxParser.attributesDictionary(attributes, attributeCount: attributeCount) ?? StringDictionary()
    attributesStack.append(elementAttributes)

    // Inside XHTML content every element is copied into the XHTML string.
    guard !parsingXHTML else {
        addXHTMLTag(localName)
        return
    }

    if SAXEqualTags(localName, XMLName.entry) {
        parsingArticle = true
        addArticle()
        return
    }
    if SAXEqualTags(localName, XMLName.author) {
        parsingAuthor = true
        currentAuthor = RSSAuthor()
        return
    }
    if SAXEqualTags(localName, XMLName.source) {
        parsingSource = true
        return
    }

    if parsingArticle {
        let isContentTag = SAXEqualTags(localName, XMLName.content)
        if isContentTag || SAXEqualTags(localName, XMLName.summary) {
            if isContentTag {
                currentArticle?.language = elementAttributes["xml:lang"]
            }
            if elementAttributes["type"] == "xhtml" {
                // Switch to XHTML mode with a fresh accumulator.
                parsingXHTML = true
                xhtmlString = ""
                return
            }
        }
    } else if SAXEqualTags(localName, XMLName.link) {
        // A link outside any entry belongs to the feed itself.
        addFeedLink()
        return
    }

    if SAXEqualTags(localName, XMLName.feed) {
        addFeedLanguage()
    }

    saxParser.beginStoringCharacters()
}
/// Handles an element end: stores collected values on the feed, article, or author,
/// updates parser state, and pops the element's attributes.
///
/// The end of `feed` stops all further processing. While in XHTML mode the closing
/// tag is appended to `xhtmlString`; the end of `content` or `summary` leaves XHTML mode
/// and stores the accumulated XHTML as the article body.
public func saxParser(_ saxParser: SAXParser, xmlEndElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {

    if SAXEqualTags(localName, XMLName.feed) {
        endFeedFound = true
        return
    }
    if endFeedFound {
        return
    }

    if parsingXHTML {
        let isContentTag = SAXEqualTags(localName, XMLName.content)
        let isSummaryTag = SAXEqualTags(localName, XMLName.summary)

        if parsingArticle && (isContentTag || isSummaryTag) {
            if isContentTag {
                currentArticle?.body = xhtmlString
            }
            else if isSummaryTag {
                // A summary only becomes the body when there is no (non-empty) body yet.
                if (currentArticle?.body?.count ?? 0) < 1 {
                    currentArticle?.body = xhtmlString
                }
            }
        }

        if isContentTag || isSummaryTag {
            parsingXHTML = false
        }

        if xhtmlString == nil {
            assertionFailure("xhtmlString must not be nil when parsingXHTML in xmlEndElement.")
        }
        else if let localNameString = String(xmlPointer: localName) {
            // Append to the stored property. Binding it with `if var` would only mutate
            // a local copy and the closing tag would be lost.
            xhtmlString?.append("</")
            xhtmlString?.append(localNameString)
            xhtmlString?.append(">")
        }
    }

    else if parsingAuthor {
        if SAXEqualTags(localName, XMLName.author) {
            parsingAuthor = false
            if let currentAuthor, !currentAuthor.isEmpty() {
                currentArticle?.addAuthor(currentAuthor)
            }
            currentAuthor = nil
        }
        else if SAXEqualTags(localName, XMLName.name) {
            currentAuthor?.name = saxParser.currentStringWithTrimmedWhitespace
        }
        else if SAXEqualTags(localName, XMLName.email) {
            currentAuthor?.emailAddress = saxParser.currentStringWithTrimmedWhitespace
        }
        else if SAXEqualTags(localName, XMLName.uri) {
            currentAuthor?.url = saxParser.currentStringWithTrimmedWhitespace
        }
    }

    else if SAXEqualTags(localName, XMLName.entry) {
        parsingArticle = false
    }

    else if parsingArticle && !parsingSource {
        addArticleElement(saxParser, localName, prefix)
    }

    else if SAXEqualTags(localName, XMLName.source) {
        parsingSource = false
    }

    else if !parsingArticle && !parsingSource && SAXEqualTags(localName, XMLName.title) {
        addFeedTitle(saxParser)
    }

    _ = attributesStack.popLast()
}
/// Appends character data to `xhtmlString` while in XHTML mode, re-encoding `&`, `<`, and `>`.
///
/// Ignored outside XHTML mode, and when the bytes cannot be converted to a string.
public func saxParser(_ saxParser: SAXParser, xmlCharactersFound: XMLPointer, count: Int) {

    guard parsingXHTML else {
        return
    }
    guard var s = String(xmlPointer: xmlCharactersFound, count: count) else {
        return
    }

    // libxml decodes all entities; we need to re-encode certain characters
    // (<, >, and &) when inside XHTML text content.
    // `&` must be handled first, otherwise the `&` of the entities inserted
    // for `<` and `>` would itself be escaped again.
    s = s.replacingOccurrences(of: "&", with: "&amp;")
    s = s.replacingOccurrences(of: "<", with: "&lt;")
    s = s.replacingOccurrences(of: ">", with: "&gt;")

    // Accumulate rather than overwrite: tags and earlier text are already in the string.
    xhtmlString = (xhtmlString ?? "") + s
}
}

View File

@@ -0,0 +1,111 @@
//
// RSSArticle.swift
//
//
// Created by Brent Simmons on 8/27/24.
//
import Foundation
//import FoundationExtras
/// Mutable article model filled in by the XML feed parsers.
final class RSSArticle {

    var feedURL: String

    /// An RSS guid, if present, or calculated from other attributes.
    /// Should be unique to the feed, but not necessarily unique
    /// across different feeds. (Not suitable for a database ID.)
    /// Computed on first access and then kept.
    lazy var articleID: String = {
        guid ?? calculatedArticleID()
    }()

    var guid: String?
    var title: String?
    var body: String?
    var link: String?
    var permalink: String?
    var authors: [RSSAuthor]?
    var enclosures: [RSSEnclosure]?
    var datePublished: Date?
    var dateModified: Date?

    /// Set to the current date when the article object is created.
    var dateParsed: Date
    var language: String?

    init(_ feedURL: String) {
        self.feedURL = feedURL
        dateParsed = Date()
    }

    /// Appends `enclosure`, creating the enclosures array on first use.
    func addEnclosure(_ enclosure: RSSEnclosure) {
        var updated = enclosures ?? [RSSEnclosure]()
        updated.append(enclosure)
        enclosures = updated
    }

    /// Appends `author`, creating the authors array on first use.
    func addAuthor(_ author: RSSAuthor) {
        var updated = authors ?? [RSSAuthor]()
        updated.append(author)
        authors = updated
    }
}
private extension RSSArticle {
/// Calculates an article ID for an article that has no guid.
///
/// Intended to be unique inside a feed, but not globally unique, so it is not
/// suitable for a database ID. The source string is chosen in this order, then hashed:
/// with a publication date, the first non-empty of permalink, link, or title, followed by
/// the date's timestamp (or the timestamp alone); without a date, the first non-empty of
/// permalink, link, title, or body (or the empty string).
func calculatedArticleID() -> String {

    func nonEmpty(_ candidate: String?) -> String? {
        guard let candidate, !candidate.isEmpty else {
            return nil
        }
        return candidate
    }

    let bestLinkOrTitle = nonEmpty(permalink) ?? nonEmpty(link) ?? nonEmpty(title)
    let timestamp = datePublished.map { String(format: "%.0f", $0.timeIntervalSince1970) }

    let source: String
    if let timestamp {
        // A permalink plus a pubDate should be rock-solid in theory, though feeds are buggy.
        source = (bestLinkOrTitle ?? "") + timestamp
    } else {
        source = bestLinkOrTitle ?? nonEmpty(body) ?? ""
    }

    return source.md5String
}
}

View File

@@ -0,0 +1,40 @@
//
// RSSAuthor.swift
//
//
// Created by Brent Simmons on 8/27/24.
//
import Foundation
/// Mutable author model filled in by the XML feed parsers.
final class RSSAuthor {

    var name: String?
    var url: String?
    var avatarURL: String?
    var emailAddress: String?

    init(name: String? = nil, url: String? = nil, avatarURL: String? = nil, emailAddress: String? = nil) {
        self.name = name
        self.url = url
        self.avatarURL = avatarURL
        self.emailAddress = emailAddress
    }

    /// Use when the actual property is unknown. Guesses from the string's contents:
    /// anything containing "@" is an email address, anything starting with "http"
    /// (case-insensitively) is a URL, and everything else is a name.
    convenience init(singleString: String) {
        let looksLikeEmail = singleString.contains("@")
        let looksLikeURL = !looksLikeEmail && singleString.lowercased().hasPrefix("http")
        let isPlainName = !looksLikeEmail && !looksLikeURL

        self.init(
            name: isPlainName ? singleString : nil,
            url: looksLikeURL ? singleString : nil,
            emailAddress: looksLikeEmail ? singleString : nil)
    }

    /// True when every property is nil.
    func isEmpty() -> Bool {
        let values = [name, url, avatarURL, emailAddress]
        return values.allSatisfy { $0 == nil }
    }
}

View File

@@ -0,0 +1,20 @@
//
// RSSEnclosure.swift
//
//
// Created by Brent Simmons on 8/27/24.
//
import Foundation
/// Mutable enclosure model filled in by the XML feed parsers. Only the URL is required.
final class RSSEnclosure {

    var url: String
    var length: Int? = nil
    var mimeType: String? = nil
    var title: String? = nil

    init(url: String) {
        self.url = url
    }
}

View File

@@ -0,0 +1,22 @@
//
// RSSFeed.swift
//
//
// Created by Brent Simmons on 8/27/24.
//
import Foundation
/// Mutable feed model filled in by the XML feed parsers. Only the feed URL is required.
final class RSSFeed {

    var urlString: String
    var title: String? = nil
    var link: String? = nil
    var language: String? = nil
    var articles: [RSSArticle]? = nil

    init(urlString: String) {
        self.urlString = urlString
    }
}

View File

@@ -0,0 +1,75 @@
//
// RSSFeedTransformer.swift
// Parser
//
// Created by Brent Simmons on 6/25/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// Converts the internal, mutable feed model into the public, immutable one.
struct RSSFeedTransformer {

    /// Turn an internal RSSFeed into a public ParsedFeed.
    ///
    /// Title, link, URL, and language are copied from `feed`; the remaining feed-level
    /// properties are left nil (and `expired` false).
    static func parsedFeed(with feed: RSSFeed, feedType: FeedType) -> ParsedFeed {
        ParsedFeed(
            type: feedType,
            title: feed.title,
            homePageURL: feed.link,
            feedURL: feed.urlString,
            language: feed.language,
            feedDescription: nil,
            nextURL: nil,
            iconURL: nil,
            faviconURL: nil,
            authors: nil,
            expired: false,
            hubs: nil,
            items: parsedItems(feed.articles))
    }
}
private extension RSSFeedTransformer {
/// Converts every article into a `ParsedItem`; a nil array yields an empty set.
static func parsedItems(_ articles: [RSSArticle]?) -> Set<ParsedItem> {
    if let articles {
        return Set(articles.map(parsedItem))
    }
    return Set<ParsedItem>()
}
/// Converts one article into a `ParsedItem`.
///
/// The article's permalink becomes `url`, its link becomes `externalURL`, and its body
/// becomes `contentHTML`.
static func parsedItem(_ article: RSSArticle) -> ParsedItem {
    ParsedItem(
        syncServiceID: nil,
        uniqueID: article.articleID,
        feedURL: article.feedURL,
        url: article.permalink,
        externalURL: article.link,
        title: article.title,
        language: article.language,
        contentHTML: article.body,
        contentText: nil,
        summary: nil,
        imageURL: nil,
        bannerImageURL: nil,
        datePublished: article.datePublished,
        dateModified: article.dateModified,
        authors: parsedAuthors(article.authors),
        tags: nil,
        attachments: parsedAttachments(article.enclosures))
}
/// Converts internal authors into `ParsedAuthor`s (name, URL, and email address only).
///
/// - Returns: `nil` when `authors` is nil or empty.
static func parsedAuthors(_ authors: [RSSAuthor]?) -> Set<ParsedAuthor>? {
    guard let authors, !authors.isEmpty else {
        return nil
    }

    var converted = [ParsedAuthor]()
    for author in authors {
        converted.append(ParsedAuthor(name: author.name, url: author.url, avatarURL: nil, emailAddress: author.emailAddress))
    }
    return converted.isEmpty ? nil : Set(converted)
}
/// Converts enclosures into `ParsedAttachment`s, dropping any the attachment initializer rejects.
///
/// A length that is nil, zero, or negative is passed on as a nil size.
/// - Returns: `nil` when there are no enclosures or none could be converted.
static func parsedAttachments(_ enclosures: [RSSEnclosure]?) -> Set<ParsedAttachment>? {
    guard let enclosures, !enclosures.isEmpty else {
        return nil
    }

    var converted = [ParsedAttachment]()
    for enclosure in enclosures {
        let sizeInBytes = enclosure.length.flatMap { $0 > 0 ? $0 : nil }
        if let attachment = ParsedAttachment(url: enclosure.url, mimeType: enclosure.mimeType, title: nil, sizeInBytes: sizeInBytes, durationInSeconds: nil) {
            converted.append(attachment)
        }
    }

    guard !converted.isEmpty else {
        return nil
    }
    return Set(converted)
}
}

View File

@@ -0,0 +1,366 @@
//
// RSSParser.swift
// Parser
//
// Created by Brent Simmons on 6/25/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
import RSCore
/// SAX-driven parser that fills an `RSSFeed` from RSS XML data.
public final class RSSParser {

    private let feedURL: String
    private let data: Data
    private let feed: RSSFeed

    /// Articles in the order they were added while parsing.
    private var articles = [RSSArticle]()

    /// The most recently added article, if any.
    private var currentArticle: RSSArticle? {
        return articles.last
    }

    // Parser state.
    private var endRSSFound = false
    private var isRDF = false
    private var parsingArticle = false
    private var parsingChannelImage = false
    private var parsingAuthor = false
    private var currentAttributes: StringDictionary?

    /// Parses `data` and returns the resulting feed.
    static func parsedFeed(urlString: String, data: Data) -> RSSFeed {
        let rssParser = RSSParser(urlString: urlString, data: data)
        rssParser.parse()
        return rssParser.feed
    }

    init(urlString: String, data: Data) {
        self.data = data
        feedURL = urlString
        feed = RSSFeed(urlString: urlString)
    }
}
private extension RSSParser {
/// Runs the SAX parser over `data` with this object as delegate, then hands the collected articles to the feed.
func parse() {
    SAXParser(delegate: self, data: data).parse()
    feed.articles = articles
}
/// Element names and namespace prefixes, as C strings, for comparison via `SAXEqualTags`.
private enum XMLName {

    // Document structure
    static let rss = "rss".utf8CString
    static let uppercaseRDF = "RDF".utf8CString
    static let item = "item".utf8CString
    static let image = "image".utf8CString

    // Un-prefixed elements
    static let guid = "guid".utf8CString
    static let enclosure = "enclosure".utf8CString
    static let author = "author".utf8CString
    static let link = "link".utf8CString
    static let title = "title".utf8CString
    static let language = "language".utf8CString
    static let pubDate = "pubDate".utf8CString
    static let description = "description".utf8CString

    // `dc` prefix and its elements
    static let dc = "dc".utf8CString
    static let creator = "creator".utf8CString
    static let date = "date".utf8CString

    // `content` prefix and its element
    static let content = "content".utf8CString
    static let encoded = "encoded".utf8CString
}
/// Applies one un-prefixed, feed-level element to the feed.
///
/// `link` is only set once; `title` and `language` are overwritten each time they occur.
/// Elements with a namespace prefix are ignored.
func addFeedElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) {
    if prefix != nil {
        return
    }

    if SAXEqualTags(localName, XMLName.link) {
        if feed.link == nil {
            feed.link = saxParser.currentString
        }
        return
    }
    if SAXEqualTags(localName, XMLName.title) {
        feed.title = saxParser.currentString
        return
    }
    if SAXEqualTags(localName, XMLName.language) {
        feed.language = saxParser.currentString
    }
}
/// Appends a new, empty article for this feed; it becomes `currentArticle`.
func addArticle() {
    articles.append(RSSArticle(feedURL))
}
/// Applies one just-ended element to the current article.
///
/// Prefixed elements: `dc:*` goes to `addDCElement`, and a non-empty `content:encoded`
/// becomes the body; every other prefixed element is ignored. Un-prefixed elements are
/// matched by name when the parser has a current string; `enclosure` is handled only
/// when it does not.
func addArticleElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) {
    guard let article = currentArticle else {
        return
    }

    if let prefix {
        if SAXEqualTags(prefix, XMLName.dc) {
            addDCElement(saxParser, localName, article)
        }
        else if SAXEqualTags(prefix, XMLName.content) && SAXEqualTags(localName, XMLName.encoded) {
            if let encodedContent = saxParser.currentString, !encodedContent.isEmpty {
                article.body = encodedContent
            }
        }
        return
    }

    guard let text = saxParser.currentString else {
        if SAXEqualTags(localName, XMLName.enclosure), let currentAttributes {
            addEnclosure(currentAttributes, article)
        }
        return
    }

    if SAXEqualTags(localName, XMLName.guid) {
        addGuid(text, article)
    }
    else if SAXEqualTags(localName, XMLName.author) {
        addAuthorWithString(text, article)
    }
    else if SAXEqualTags(localName, XMLName.link) {
        article.link = urlString(text)
    }
    else if SAXEqualTags(localName, XMLName.description) {
        // `description` never replaces a body that is already set.
        if article.body == nil {
            article.body = text
        }
    }
    else if !parsingAuthor && SAXEqualTags(localName, XMLName.title) {
        article.title = text
    }
    else if SAXEqualTags(localName, XMLName.pubDate) {
        article.datePublished = currentDate(saxParser)
    }
}
/// Applies a `dc:`-prefixed element: `creator` adds an author (when there is a current string)
/// and `date` sets the publication date. Other names are ignored.
func addDCElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ currentArticle: RSSArticle) {
    if SAXEqualTags(localName, XMLName.creator) {
        guard let creator = saxParser.currentString else {
            return
        }
        addAuthorWithString(creator, currentArticle)
        return
    }

    if SAXEqualTags(localName, XMLName.date) {
        currentArticle.datePublished = currentDate(saxParser)
    }
}
// Attribute names and value that `addGuid` uses when inspecting a guid's attributes.
/// Canonical spelling of the guid attribute name; looked up first.
static let isPermalinkKey = "isPermaLink"
/// Lowercased attribute name, compared against lowercased keys to allow other casings.
static let isPermalinkLowercaseKey = "ispermalink"
/// Attribute value that marks a guid as not being a permalink (compared exactly).
static let falseValue = "false"
/// Stores `guid` on the article and, when appropriate, also uses it as the permalink.
///
/// Per the RSS spec `isPermaLink` is optional and defaults to true, so the guid is
/// treated as a permalink unless the attribute is present and equal to "false".
/// Because feeds in the wild often use non-URL guids without setting the attribute,
/// the guid must also look like a URL or relative path.
/// Nothing beyond storing the guid happens when there are no current attributes.
func addGuid(_ guid: String, _ currentArticle: RSSArticle) {
    currentArticle.guid = guid

    guard let attributes = currentAttributes else {
        return
    }

    // Prefer the canonical spelling; otherwise accept any casing (`ispermalink`, `isPermalink`, etc.).
    let isPermaLinkValue = attributes[Self.isPermalinkKey]
        ?? attributes.first(where: { $0.key.lowercased() == Self.isPermalinkLowercaseKey })?.value

    if isPermaLinkValue == Self.falseValue {
        return
    }

    guard stringIsProbablyAURLOrRelativePath(guid) else {
        return
    }
    currentArticle.permalink = urlString(guid)
}
/// Heuristic: does this guid look like a URL or a relative path?
///
/// The RSS guid is a permalink by default, except when written as
/// `<guid isPermaLink="false">someidentifier</guid>`. Many feeds omit the
/// attribute even though the guid is not a URL, so this tries to detect
/// values that cannot be a URL or path. It may need to evolve over time.
///
/// - Returns: `true` when the string contains a `/` and does not start
///   with `tag:` (case-insensitive).
func stringIsProbablyAURLOrRelativePath(_ s: String) -> Bool {
	// Bad guids are often just integers; anything without a slash is rejected.
	let containsSlash = s.contains("/")
	// `tag:` URIs are a common non-URL guid form.
	let isTagURI = s.lowercased().hasPrefix("tag:")
	return containsSlash && !isTagURI
}
/// Makes a best attempt at turning a string into a URL string.
///
/// A string that starts with `http` (case-insensitive) is returned unchanged.
/// Anything else is treated as a relative URL and resolved against the
/// feed's home page URL when available, otherwise against the feed URL.
///
/// The returned value is not guaranteed to be a valid URL string.
/// It's a best attempt without going to heroic lengths: whenever
/// resolution is not possible the input is returned as-is.
func urlString(_ s: String) -> String {
	guard !s.lowercased().hasPrefix("http") else {
		return s
	}
	guard let base = URL(string: feed.link ?? feedURL),
		  let resolved = URL(string: s, relativeTo: base) else {
		return s
	}
	return resolved.absoluteString
}
/// Adds an author built from a single free-form string; empty strings are ignored.
func addAuthorWithString(_ authorString: String, _ currentArticle: RSSArticle) {
	guard !authorString.isEmpty else {
		return
	}
	currentArticle.addAuthor(RSSAuthor(singleString: authorString))
}
/// Attribute names read from an `<enclosure>` element.
private struct EnclosureKey {
	static let url = "url"
	static let length = "length"
	static let type = "type"
}

/// Builds an enclosure from element attributes and attaches it to the article.
///
/// An enclosure without a non-empty `url` is skipped. `length` is applied
/// only when it parses as an integer; `type` is copied as-is (may be nil).
func addEnclosure(_ attributes: StringDictionary, _ currentArticle: RSSArticle) {
	let enclosureURL = attributes[EnclosureKey.url] ?? ""
	if enclosureURL.isEmpty {
		return
	}

	let enclosure = RSSEnclosure(url: enclosureURL)
	if let byteLength = attributes[EnclosureKey.length].flatMap({ Int($0) }) {
		enclosure.length = byteLength
	}
	enclosure.mimeType = attributes[EnclosureKey.type]

	currentArticle.addEnclosure(enclosure)
}
/// Parses the parser's currently stored characters as a date.
///
/// - Returns: nil when no characters are stored or `DateParser` cannot parse them.
func currentDate(_ saxParser: SAXParser) -> Date? {
	saxParser.currentCharacters.flatMap { DateParser.date(data: $0) }
}
}
extension RSSParser: SAXParserDelegate {

	/// Attribute on an RSS 1.0 (RDF) `<item>` that holds its guid.
	static let rdfAbout = "rdf:about"

	/// Start-of-element callback: tracks parser state (RDF mode, item, channel
	/// image, author) and captures attributes for the elements that need them.
	public func saxParser(_ saxParser: SAXParser, xmlStartElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {

		guard !endRSSFound else {
			return
		}

		if SAXEqualTags(localName, XMLName.uppercaseRDF) {
			isRDF = true
			return
		}

		let isItem = SAXEqualTags(localName, XMLName.item)

		// Attributes are only read for RDF items, guids, and enclosures.
		let needsAttributes = (isRDF && isItem)
			|| SAXEqualTags(localName, XMLName.guid)
			|| SAXEqualTags(localName, XMLName.enclosure)
		let elementAttributes: StringDictionary? = needsAttributes
			? saxParser.attributesDictionary(attributes, attributeCount: attributeCount)
			: nil
		if currentAttributes != elementAttributes {
			currentAttributes = elementAttributes
		}

		// State changes below apply to unprefixed elements only.
		if prefix == nil {
			if isItem {
				addArticle()
				parsingArticle = true

				if isRDF, let rdfGuid = elementAttributes?[Self.rdfAbout], let currentArticle { // RSS 1.0 guid
					currentArticle.guid = rdfGuid
					currentArticle.permalink = rdfGuid
				}
			}
			else if SAXEqualTags(localName, XMLName.image) {
				parsingChannelImage = true
			}
			else if SAXEqualTags(localName, XMLName.author) {
				if parsingArticle {
					parsingAuthor = true
				}
			}
		}

		// Text inside the channel image is not collected.
		if !parsingChannelImage {
			saxParser.beginStoringCharacters()
		}
	}

	/// End-of-element callback: closes out state set at element start and
	/// routes the finished element to the article or feed handler.
	public func saxParser(_ saxParser: SAXParser, xmlEndElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {

		guard !endRSSFound else {
			return
		}

		let closesDocument = (isRDF && SAXEqualTags(localName, XMLName.uppercaseRDF))
			|| SAXEqualTags(localName, XMLName.rss)
		if closesDocument {
			endRSSFound = true
			return
		}

		if SAXEqualTags(localName, XMLName.image) {
			parsingChannelImage = false
			return
		}

		if SAXEqualTags(localName, XMLName.item) {
			parsingArticle = false
			return
		}

		if parsingArticle {
			addArticleElement(saxParser, localName, prefix)
			if SAXEqualTags(localName, XMLName.author) {
				parsingAuthor = false
			}
			return
		}

		if !parsingChannelImage {
			addFeedElement(saxParser, localName, prefix)
		}
	}

	public func saxParser(_ saxParser: SAXParser, xmlCharactersFound: XMLPointer, count: Int) {
		// Required method. Nothing to do here.
	}
}

View File

@@ -0,0 +1,12 @@
//
// JSONDictionary.swift
// Parser
//
// Created by Brent Simmons on 6/24/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// A JSON object: string keys mapped to values of any type.
public typealias JSONDictionary = [String: Any]
/// A JSON array whose elements are all JSON objects.
public typealias JSONArray = [JSONDictionary]

View File

@@ -0,0 +1,27 @@
//
// JSONUtilities.swift
// Parser
//
// Created by Brent Simmons on 12/10/17.
// Copyright © 2017 Ranchero Software, LLC. All rights reserved.
//
import Foundation
/// Convenience wrappers around `JSONSerialization` that return nil instead of throwing.
public struct JSONUtilities {

	/// Deserializes JSON data into a Foundation object.
	/// - Returns: nil when `data` is not valid JSON.
	public static func object(with data: Data) -> Any? {
		do {
			return try JSONSerialization.jsonObject(with: data)
		}
		catch {
			return nil
		}
	}

	/// Deserializes JSON data whose top level is an object.
	/// - Returns: nil when parsing fails or the top level is not a `JSONDictionary`.
	public static func dictionary(with data: Data) -> JSONDictionary? {
		guard let dictionary = object(with: data) as? JSONDictionary else {
			return nil
		}
		return dictionary
	}

	/// Deserializes JSON data whose top level is an array of objects.
	/// - Returns: nil when parsing fails or the top level is not a `JSONArray`.
	public static func array(with data: Data) -> JSONArray? {
		guard let array = object(with: data) as? JSONArray else {
			return nil
		}
		return array
	}
}

View File

@@ -0,0 +1,446 @@
//
// HTMLEntityDecoder.swift
//
//
// Created by Brent Simmons on 9/26/24.
//
import Foundation
/// Decodes HTML entities (named, decimal, and hexadecimal) in strings.
public final class HTMLEntityDecoder {

	/// Returns `encodedString` with HTML entities replaced by the characters they name.
	///
	/// When no entity is decoded, the original string is returned unchanged.
	public static func decodedString(_ encodedString: String) -> String {

		var didDecodeAtLeastOneEntity = false

		// If `withContiguousStorageIfAvailable` works, then we can avoid copying memory.
		let decoded: String
		if let fastResult = encodedString.utf8.withContiguousStorageIfAvailable({ buffer in
			decodedEntities(buffer, &didDecodeAtLeastOneEntity)
		}) {
			decoded = fastResult
		}
		else {
			// Fallback: copy the UTF-8 bytes. This path always produces a value,
			// so no "missing result" case remains to be handled afterwards.
			decoded = Data(encodedString.utf8).withUnsafeBytes { bytes in
				decodedEntities(bytes.bindMemory(to: UInt8.self), &didDecodeAtLeastOneEntity)
			}
		}

		return didDecodeAtLeastOneEntity ? decoded : encodedString
	}
}
// ASCII byte values used while scanning for entities.
// `UInt8(ascii:)` yields the byte value directly, with no optional to force-unwrap.
private let ampersandCharacter = UInt8(ascii: "&")
private let numberSignCharacter = UInt8(ascii: "#")
private let xCharacter = UInt8(ascii: "x")
private let XCharacter = UInt8(ascii: "X")
private let semicolonCharacter = UInt8(ascii: ";")
private let zeroCharacter = UInt8(ascii: "0")
private let nineCharacter = UInt8(ascii: "9")
private let aCharacter = UInt8(ascii: "a")
private let fCharacter = UInt8(ascii: "f")
private let zCharacter = UInt8(ascii: "z")
private let ACharacter = UInt8(ascii: "A")
private let FCharacter = UInt8(ascii: "F")
private let ZCharacter = UInt8(ascii: "Z")

// Highest valid Unicode code point.
private let maxUnicodeNumber = 0x10FFFF
/// Copies `sourceBuffer` into a new string, replacing each recognized HTML
/// entity with its decoded UTF-8 bytes.
///
/// - Parameters:
///   - sourceBuffer: UTF-8 bytes of the encoded string.
///   - didDecodeAtLeastOneEntity: Set to `true` when at least one entity was decoded.
/// - Returns: The decoded string.
private func decodedEntities(_ sourceBuffer: UnsafeBufferPointer<UInt8>, _ didDecodeAtLeastOneEntity: inout Bool) -> String {

	let byteCount = sourceBuffer.count
	let resultBufferByteCount = byteCount + 1

	// Allocate a destination buffer for the result string. It can be the same size
	// as the source string buffer, since decoding HTML entities will only make it smaller.
	// One spare byte is kept so the buffer is never zero-sized.
	let resultBuffer = UnsafeMutableRawPointer.allocate(byteCount: resultBufferByteCount, alignment: MemoryLayout<UInt8>.alignment)
	defer {
		resultBuffer.deallocate()
	}

	resultBuffer.initializeMemory(as: UInt8.self, repeating: 0, count: resultBufferByteCount)
	let result = resultBuffer.assumingMemoryBound(to: UInt8.self)

	var sourceLocation = 0
	var resultLocation = 0

	while sourceLocation < byteCount {

		let ch = sourceBuffer[sourceLocation]

		if ch == ampersandCharacter {
			// Parse with a scratch cursor so a failed attempt cannot move
			// `sourceLocation`. Otherwise the bytes scanned while looking for
			// an entity would be skipped and lost from the output.
			var entityLocation = sourceLocation
			if let decodedEntity = decodedEntityValue(sourceBuffer, byteCount, &entityLocation) {
				addDecodedEntity(decodedEntity, result, byteCount, &resultLocation)
				didDecodeAtLeastOneEntity = true
				sourceLocation = entityLocation + 1 // Step past the terminating `;`.
				continue
			}
		}

		// Plain byte, or an `&` that does not start a decodable entity: copy verbatim.
		result[resultLocation] = ch
		resultLocation += 1
		sourceLocation += 1
	}

	// Build the string from the exact number of bytes written rather than as a
	// C string, so an embedded NUL byte does not truncate the result.
	return String(decoding: UnsafeBufferPointer(start: result, count: resultLocation), as: UTF8.self)
}
/// Writes the UTF-8 bytes of `decodedEntity` into `result` at `resultLocation`
/// and advances `resultLocation` past them.
///
/// Traps if the bytes would not fit within `resultByteCount`.
private func addDecodedEntity(_ decodedEntity: String, _ result: UnsafeMutablePointer<UInt8>, _ resultByteCount: Int, _ resultLocation: inout Int) {

	let encodedBytes = decodedEntity.utf8
	let encodedCount = encodedBytes.count
	precondition(resultLocation + encodedCount <= resultByteCount)

	for (offset, byte) in encodedBytes.enumerated() {
		result[resultLocation + offset] = byte
	}
	resultLocation += encodedCount
}
/// Tries to decode the entity that starts at the `&` at `sourceLocation`.
///
/// - Returns: The decoded text, or nil when the bytes do not form a known entity.
///
/// On success `sourceLocation` is left where the raw scan stopped (the
/// terminating `;`). On failure it is reset to the `&`, so the caller can emit
/// the `&` literally and keep scanning from the next byte without skipping text.
private func decodedEntityValue(_ buffer: UnsafeBufferPointer<UInt8>, _ byteCount: Int, _ sourceLocation: inout Int) -> String? {

	let ampersandLocation = sourceLocation

	guard let rawEntity = rawEntityValue(buffer, byteCount, &sourceLocation),
		  let decodedEntity = decodedRawEntityValue(rawEntity) else {
		// Covers both a malformed entity and a well-formed but unknown one (e.g. `&foo;`).
		sourceLocation = ampersandLocation
		return nil
	}

	return decodedEntity
}
/// Decodes a raw entity name (the bytes between `&` and `;`, zero-padded).
///
/// Named entities are looked up first; a name starting with `#` is then
/// tried as a numeric entity.
private func decodedRawEntityValue(_ rawEntity: ContiguousArray<UInt8>) -> String? {

	// The name ends at the first zero byte.
	let nameBytes = rawEntity.prefix(while: { $0 != 0 })
	let key = String(decoding: nameBytes, as: UTF8.self)

	if let namedValue = entitiesDictionary[key] {
		return namedValue
	}

	guard rawEntity[0] == numberSignCharacter else {
		return nil
	}
	return decodedNumericEntity(rawEntity)
}
/// Decodes a numeric entity (`#123` or `#x7B`) to its string value.
///
/// - Returns: nil when the digits are invalid or the value cannot be turned into a string.
private func decodedNumericEntity(_ rawEntity: ContiguousArray<UInt8>) -> String? {

	assert(rawEntity[0] == numberSignCharacter)

	// An `x` or `X` after the `#` marks a hexadecimal entity.
	let radixMarker = rawEntity[1]
	let isHex = radixMarker == xCharacter || radixMarker == XCharacter

	guard let codePoint = isHex ? decodedHexEntity(rawEntity) : decodedDecimalEntity(rawEntity) else {
		return nil
	}
	return stringWithValue(codePoint)
}
/// Parses the hexadecimal digits of a raw `#x…` / `#X…` entity.
///
/// - Returns: The numeric value, or nil when a non-hex character is found,
///   the value exceeds `maxUnicodeNumber`, or the value is zero.
private func decodedHexEntity(_ rawEntity: ContiguousArray<UInt8>) -> UInt32? {

	assert(rawEntity[0] == numberSignCharacter)
	assert(rawEntity[1] == xCharacter || rawEntity[1] == XCharacter)

	var number: UInt32 = 0

	// Skip the first two characters: #x or #X
	for byte in rawEntity.dropFirst(2) {

		if byte == 0 { // rawEntity is null-terminated
			break
		}

		let digit: UInt32
		switch byte {
		case zeroCharacter...nineCharacter: // 0-9
			digit = UInt32(byte - zeroCharacter)
		case aCharacter...fCharacter: // a-f
			digit = UInt32(byte - aCharacter) + 10
		case ACharacter...FCharacter: // A-F
			digit = UInt32(byte - ACharacter) + 10
		default:
			return nil
		}

		number = (number * 16) + digit
		if number > maxUnicodeNumber {
			return nil
		}
	}

	return number == 0 ? nil : number
}
/// Parses the decimal digits of a raw `#…` entity.
///
/// Converts, for instance, the bytes `[51, 57]` after the `#` to 39.
///
/// - Returns: The numeric value, or nil when a non-digit is found,
///   the value exceeds `maxUnicodeNumber`, or the value is zero.
private func decodedDecimalEntity(_ rawEntity: ContiguousArray<UInt8>) -> UInt32? {

	assert(rawEntity[0] == numberSignCharacter)
	assert(rawEntity[1] != xCharacter && rawEntity[1] != XCharacter) // not hex

	var number: UInt32 = 0

	// The first character is the #, so start after it.
	for byte in rawEntity.dropFirst() {

		if byte == 0 { // rawEntity is null-terminated
			break
		}

		// Be sure it's a digit 0-9
		guard (zeroCharacter...nineCharacter).contains(byte) else {
			return nil
		}

		number = (number * 10) + UInt32(byte - zeroCharacter)
		if number > maxUnicodeNumber {
			return nil
		}
	}

	return number == 0 ? nil : number
}
/// Scans the entity name that follows the `&` at `sourceLocation`.
///
/// - Returns: The name bytes (without `&` and `;`) in a fixed 36-byte,
///   zero-padded array, or nil when no well-formed entity is found.
///
/// On success `sourceLocation` is advanced to the terminating `;`.
/// On every failure path `sourceLocation` is left unchanged (still at the `&`),
/// so a caller that falls back to emitting the `&` literally does not skip
/// the bytes that were examined here.
private func rawEntityValue(_ buffer: UnsafeBufferPointer<UInt8>, _ byteCount: Int, _ sourceLocation: inout Int) -> ContiguousArray<UInt8>? {

	// sourceLocation points to the & character.
	let maxEntityCharacters = 36 // Longest current entity is &CounterClockwiseContourIntegral;
	// 35 name characters plus a zero-terminated last character.
	var entityCharacters = ContiguousArray<UInt8>(repeating: 0, count: maxEntityCharacters)
	var entityCharactersIndex = 0

	// Scan with a local cursor and commit it only on success.
	var location = sourceLocation

	while true {

		location += 1
		if location >= byteCount || entityCharactersIndex >= maxEntityCharacters { // did not parse entity
			return nil
		}

		let ch = buffer[location]
		if ch == semicolonCharacter { // End of entity?
			sourceLocation = location
			return entityCharacters
		}

		// Make sure character is in 0-9, A-Z, a-z, #
		switch ch {
		case numberSignCharacter,
			 zeroCharacter...nineCharacter,
			 ACharacter...ZCharacter,
			 aCharacter...zCharacter:
			break
		default:
			return nil
		}

		entityCharacters[entityCharactersIndex] = ch
		entityCharactersIndex += 1
	}
}
/// Returns the one-character string for a numeric entity value.
///
/// Values 0x80-0x9F are remapped through the Windows Latin-1 extension table
/// (as browsers do), so that e.g. `&#128;` yields the euro sign.
///
/// - Returns: nil when the (remapped) value is not a valid Unicode scalar,
///   such as a surrogate code point or a value above 0x10FFFF.
private func stringWithValue(_ value: UInt32) -> String? {

	// From WebCore's HTMLEntityParser
	let windowsLatin1ExtensionArray: [UInt32] = [
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
	]

	let codePoint: UInt32
	if (0x80..<0xA0).contains(value) {
		codePoint = windowsLatin1ExtensionArray[Int(value - 0x80)]
	}
	else {
		codePoint = value
	}

	// The failable initializer rejects surrogates and out-of-range values.
	guard let scalar = Unicode.Scalar(codePoint) else {
		return nil
	}
	return String(Character(scalar))
}
/// Named HTML entities (without the leading `&` and trailing `;`) mapped to
/// the text they decode to.
///
/// Characters outside Latin-1, and invisible characters, are written as
/// Unicode escapes so the values survive any re-encoding of this file.
private let entitiesDictionary: [String: String] = [
	"AElig": "Æ",
	"Aacute": "Á",
	"Acirc": "Â",
	"Agrave": "À",
	"Aring": "Å",
	"Atilde": "Ã",
	"Auml": "Ä",
	"Ccedil": "Ç",
	"Dstrok": "Ð",
	"ETH": "Ð",
	"Eacute": "É",
	"Ecirc": "Ê",
	"Egrave": "È",
	"Euml": "Ë",
	"Iacute": "Í",
	"Icirc": "Î",
	"Igrave": "Ì",
	"Iuml": "Ï",
	"Ntilde": "Ñ",
	"Oacute": "Ó",
	"Ocirc": "Ô",
	"Ograve": "Ò",
	"Oslash": "Ø",
	"Otilde": "Õ",
	"Ouml": "Ö",
	"Pi": "\u{03A0}", // GREEK CAPITAL LETTER PI
	"THORN": "Þ",
	"Uacute": "Ú",
	"Ucirc": "Û",
	"Ugrave": "Ù",
	"Uuml": "Ü",
	"Yacute": "\u{00DD}", // LATIN CAPITAL LETTER Y WITH ACUTE
	"aacute": "á",
	"acirc": "â",
	"acute": "´",
	"aelig": "æ",
	"agrave": "à",
	"amp": "&",
	"apos": "'",
	"aring": "å",
	"atilde": "ã",
	"auml": "ä",
	"brkbar": "¦",
	"brvbar": "¦",
	"ccedil": "ç",
	"cedil": "¸",
	"cent": "¢",
	"copy": "©",
	"curren": "¤",
	"deg": "°",
	"die": "¨",
	"divide": "÷",
	"eacute": "é",
	"ecirc": "ê",
	"egrave": "è",
	"eth": "ð",
	"euml": "ë",
	"euro": "\u{20AC}", // EURO SIGN
	"frac12": "½",
	"frac14": "¼",
	"frac34": "¾",
	"gt": ">",
	"hearts": "\u{2665}", // BLACK HEART SUIT
	"hellip": "\u{2026}", // HORIZONTAL ELLIPSIS
	"iacute": "í",
	"icirc": "î",
	"iexcl": "¡",
	"igrave": "ì",
	"iquest": "¿",
	"iuml": "ï",
	"laquo": "«",
	"ldquo": "\u{201C}", // LEFT DOUBLE QUOTATION MARK
	"lsquo": "\u{2018}", // LEFT SINGLE QUOTATION MARK
	"lt": "<",
	"macr": "¯",
	"mdash": "\u{2014}", // EM DASH
	"micro": "µ",
	"middot": "·",
	"ndash": "\u{2013}", // EN DASH
	"not": "¬",
	"ntilde": "ñ",
	"oacute": "ó",
	"ocirc": "ô",
	"ograve": "ò",
	"ordf": "ª",
	"ordm": "º",
	"oslash": "ø",
	"otilde": "õ",
	"ouml": "ö",
	"para": "\u{00B6}", // PILCROW SIGN
	"pi": "\u{03C0}", // GREEK SMALL LETTER PI
	"plusmn": "±",
	"pound": "£",
	"quot": "\"",
	"raquo": "»",
	"rdquo": "\u{201D}", // RIGHT DOUBLE QUOTATION MARK
	"reg": "®",
	"rsquo": "\u{2019}", // RIGHT SINGLE QUOTATION MARK
	"sect": "§",
	"shy": "\u{00AD}", // SOFT HYPHEN (code point 173)
	"sup1": "¹",
	"sup2": "²",
	"sup3": "³",
	"szlig": "ß",
	"thorn": "þ",
	"times": "×",
	"trade": "\u{2122}", // TRADE MARK SIGN
	"uacute": "ú",
	"ucirc": "û",
	"ugrave": "ù",
	"uml": "¨",
	"uuml": "ü",
	"yacute": "\u{00FD}", // LATIN SMALL LETTER Y WITH ACUTE
	"yen": "¥",
	"yuml": "ÿ",
	"infin": "\u{221E}", // INFINITY
	"nbsp": "\u{00A0}" // NO-BREAK SPACE (code point 160)
]

View File

@@ -0,0 +1,22 @@
//
// HTMLLink.swift
//
//
// Created by Brent Simmons on 9/21/24.
//
import Foundation
/// One anchor found in an HTML document.
/// All properties are optional and mutable; the initializer defaults each to nil.
public final class HTMLLink {
public var urlString: String? // Absolute URL string
public var text: String?
public var title: String? // Title attribute inside anchor tag
/// Creates a link; every value defaults to nil so it can be filled in later.
init(urlString: String? = nil, text: String? = nil, title: String? = nil) {
self.urlString = urlString
self.text = text
self.title = title
}
}

View File

@@ -0,0 +1,120 @@
//
// HTMLLinkParser.swift
//
//
// Created by Brent Simmons on 9/21/24.
//
import Foundation
import RSCore
import os
/// Collects the anchors (`<a>` elements) found in an HTML document.
public final class HTMLLinkParser {

	/// Links found so far, in document order.
	public private(set) var links = [HTMLLink]()

	// `bundleIdentifier` is nil when the main bundle has no identifier
	// (for instance a command-line tool), so fall back to a fixed subsystem
	// name instead of force-unwrapping and crashing on first log.
	private static let logger = Logger(subsystem: Bundle.main.bundleIdentifier ?? "HTMLLinkParser", category: "HTMLLinkParser")

	private let parserData: ParserData
	private let baseURL: URL?

	/// Parses `parserData` and returns every link found in it.
	public static func htmlLinks(with parserData: ParserData) -> [HTMLLink] {
		let parser = HTMLLinkParser(parserData)
		parser.parse()
		return parser.links
	}

	/// - Parameter parserData: The HTML data plus the URL it came from; the URL
	///   is used as the base for resolving relative hrefs.
	init(_ parserData: ParserData) {
		self.parserData = parserData
		self.baseURL = URL(string: parserData.url)
	}
}

private extension HTMLLinkParser {

	/// Runs the SAX HTML parser over the data with `self` as its delegate.
	func parse() {
		let htmlParser = SAXHTMLParser(delegate: self, data: parserData.data)
		htmlParser.parse()
	}
}
extension HTMLLinkParser: SAXHTMLParserDelegate {

	/// The link currently being built: the most recently appended one.
	private var currentLink: HTMLLink? {
		links.last
	}

	private struct HTMLAttributeName {
		static let href = "href"
		static let title = "title"
	}

	private struct HTMLName {
		static let a = "a".utf8CString
	}

	/// The anchor's `title` attribute, looked up case-insensitively.
	private func title(with attributesDictionary: StringDictionary) -> String? {
		attributesDictionary.object(forCaseInsensitiveKey: HTMLAttributeName.title)
	}

	/// The anchor's `href` resolved against the base URL.
	/// - Returns: nil when there is no non-empty href, no base URL, or resolution fails.
	private func urlString(with attributesDictionary: StringDictionary) -> String? {
		let href = attributesDictionary.object(forCaseInsensitiveKey: HTMLAttributeName.href) ?? ""
		if href.isEmpty {
			return nil
		}

		if let baseURL, let resolvedURL = URL(string: href, relativeTo: baseURL) {
			return resolvedURL.absoluteString
		}

		Self.logger.info("Expected to create URL but got nil with \(href)")
		return nil
	}

	/// Copies the href and title attributes onto the current link.
	private func handleLinkAttributes(_ attributesDictionary: StringDictionary) {
		guard let link = currentLink else {
			assertionFailure("currentLink must not be nil")
			return
		}
		link.urlString = urlString(with: attributesDictionary)
		link.title = title(with: attributesDictionary)
	}

	/// On `<a>`: start a new link, read its attributes, and begin collecting its text.
	public func saxHTMLParser(_ saxHTMLParser: SAXHTMLParser, startElement name: XMLPointer, attributes: UnsafePointer<XMLPointer?>?) {
		if !SAXEqualTags(name, HTMLName.a) {
			return
		}

		links.append(HTMLLink())

		if let anchorAttributes = saxHTMLParser.attributesDictionary(attributes) {
			handleLinkAttributes(anchorAttributes)
		}
		saxHTMLParser.beginStoringCharacters()
	}

	/// On `</a>`: store the collected, whitespace-trimmed text on the current link.
	public func saxHTMLParser(_ saxHTMLParser: SAXHTMLParser, endElement name: XMLPointer) {
		if !SAXEqualTags(name, HTMLName.a) {
			return
		}
		guard let link = currentLink else {
			assertionFailure("currentLink must not be nil.")
			return
		}
		link.text = saxHTMLParser.currentStringWithTrimmedWhitespace
	}

	public func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int) {
		// Nothing needed.
	}
}

View File

@@ -0,0 +1,437 @@
//
// HTMLMetadata.swift
//
//
// Created by Brent Simmons on 9/22/24.
//
import Foundation
/// Metadata extracted from the `<link>` and `<meta>` tags of an HTML page:
/// favicons, Apple touch icons, feed links, Open Graph and Twitter properties.
public final class HTMLMetadata: Sendable {

	public let baseURLString: String
	public let tags: [HTMLTag]
	public let favicons: [HTMLMetadataFavicon]?
	public let appleTouchIcons: [HTMLMetadataAppleTouchIcon]?
	public let feedLinks: [HTMLMetadataFeedLink]?
	public let openGraphProperties: HTMLOpenGraphProperties?
	public let twitterProperties: HTMLTwitterProperties?

	/// - Parameters:
	///   - urlString: URL of the page; used as the base for resolving relative URLs.
	///   - tags: The link and meta tags collected from the page.
	init(_ urlString: String, _ tags: [HTMLTag]) {

		self.baseURLString = urlString
		self.tags = tags

		self.favicons = Self.resolvedFaviconLinks(urlString, tags)
		self.appleTouchIcons = Self.appleTouchIconTags(tags)?.map { HTMLMetadataAppleTouchIcon(urlString, $0) }
		self.feedLinks = Self.feedLinkTags(tags)?.map { HTMLMetadataFeedLink(urlString, $0) }

		self.openGraphProperties = HTMLOpenGraphProperties(urlString, tags)
		self.twitterProperties = HTMLTwitterProperties(urlString, tags)
	}

	/// Favicons from `rel="icon"` link tags, de-duplicated by resolved URL
	/// (first occurrence wins). Returns nil when there are none.
	static func resolvedFaviconLinks(_ baseURLString: String, _ tags: [HTMLTag]) -> [HTMLMetadataFavicon]? {

		guard let iconTags = linkTagsWithMatchingRel("icon", tags) else {
			return nil
		}

		// A set keeps the duplicate check constant-time per favicon.
		var seenURLStrings = Set<String>()
		let favicons: [HTMLMetadataFavicon] = iconTags.compactMap { htmlTag -> HTMLMetadataFavicon? in
			let favicon = HTMLMetadataFavicon(baseURLString, htmlTag)
			guard let resolvedURLString = favicon.urlString,
				  seenURLStrings.insert(resolvedURLString).inserted else {
				return nil
			}
			return favicon
		}

		return favicons.isEmpty ? nil : favicons
	}

	/// Link tags whose rel is `apple-touch-icon` or `apple-touch-icon-precomposed`.
	static func appleTouchIconTags(_ tags: [HTMLTag]) -> [HTMLTag]? {

		guard let allLinkTags = linkTags(tags) else {
			return nil
		}
		// `tagsMatchingRelValues` already returns nil for an empty result.
		return tagsMatchingRelValues(["apple-touch-icon", "apple-touch-icon-precomposed"], allLinkTags)
	}

	/// `rel="alternate"` link tags that have a feed MIME type and a non-empty URL.
	static func feedLinkTags(_ tags: [HTMLTag]) -> [HTMLTag]? {

		guard let alternateLinkTags = linkTagsWithMatchingRel("alternate", tags) else {
			return nil
		}

		let feedLinkTags = alternateLinkTags.filter { tag -> Bool in
			guard let attributes = tag.attributes,
				  let type = attributes.object(forCaseInsensitiveKey: "type"),
				  typeIsFeedType(type) else {
				return false
			}
			guard let href = urlString(from: attributes) else {
				return false
			}
			return !href.isEmpty
		}

		return feedLinkTags.isEmpty ? nil : feedLinkTags
	}

	/// Whether a MIME type names a feed format (case-insensitive).
	///
	/// Recognizes RSS, Atom, and JSON Feed. JSON Feed is served either as
	/// `application/json` or, per the JSON Feed 1.1 recommendation, as
	/// `application/feed+json`, so both suffixes are accepted.
	static func typeIsFeedType(_ type: String) -> Bool {
		let lowerType = type.lowercased()
		let feedTypeSuffixes = ["/rss+xml", "/atom+xml", "/json", "/feed+json"]
		return feedTypeSuffixes.contains { lowerType.hasSuffix($0) }
	}

	/// All tags of type `.link`, or nil when there are none.
	static func linkTags(_ tags: [HTMLTag]) -> [HTMLTag]? {
		let linkTags = tags.filter { $0.tagType == .link }
		return linkTags.isEmpty ? nil : linkTags
	}

	/// Link tags that have a non-empty URL and whose rel contains `valueToMatch`.
	static func linkTagsWithMatchingRel(_ valueToMatch: String, _ tags: [HTMLTag]) -> [HTMLTag]? {

		// Case-insensitive; matches a whitespace-delimited word

		guard let allLinkTags = linkTags(tags) else {
			return nil
		}

		let tagsWithURLString = allLinkTags.filter { tag -> Bool in
			guard let attributes = tag.attributes, let href = urlString(from: attributes) else {
				return false
			}
			return !href.isEmpty
		}
		if tagsWithURLString.isEmpty {
			return nil
		}

		// `tagsMatchingRelValues` already returns nil for an empty result.
		return tagsMatchingRelValues([valueToMatch], tagsWithURLString)
	}

	/// Tags whose rel attribute contains, as a whitespace-delimited word,
	/// any of `valuesToMatch` (compared case-insensitively).
	static func tagsMatchingRelValues(_ valuesToMatch: [String], _ tags: [HTMLTag]) -> [HTMLTag]? {

		let lowerValuesToMatch = Set(valuesToMatch.map { $0.lowercased() })

		let matchingTags = tags.filter { tag -> Bool in
			guard let attributes = tag.attributes, let rel = relValue(from: attributes) else {
				return false
			}
			return rel.components(separatedBy: .whitespacesAndNewlines).contains { oneRelValue in
				lowerValuesToMatch.contains(oneRelValue.lowercased())
			}
		}

		return matchingTags.isEmpty ? nil : matchingTags
	}
}
/// An Apple touch icon declared by a `<link>` tag.
public final class HTMLMetadataAppleTouchIcon: Sendable {

	public let rel: String?
	/// The raw `sizes` attribute value.
	public let sizes: String?
	/// `sizes` parsed as width x height; nil unless it has exactly that form.
	public let size: CGSize?
	public let urlString: String? // Absolute

	/// - Parameters:
	///   - urlString: Base URL used to resolve the icon's href.
	///   - tag: The link tag to read attributes from.
	init(_ urlString: String, _ tag: HTMLTag) {

		if let attributes = tag.attributes {
			self.rel = attributes.object(forCaseInsensitiveKey: "rel")
			self.urlString = absoluteURLString(from: attributes, baseURL: urlString)

			let sizesValue = attributes.object(forCaseInsensitiveKey: "sizes")
			self.sizes = sizesValue

			// Expect exactly two numeric components separated by a lowercase "x".
			let sizeComponents = sizesValue?.components(separatedBy: CharacterSet(charactersIn: "x")) ?? []
			if sizeComponents.count == 2, let width = Double(sizeComponents[0]), let height = Double(sizeComponents[1]) {
				self.size = CGSize(width: width, height: height)
			}
			else {
				self.size = nil
			}
		}
		else {
			self.rel = nil
			self.sizes = nil
			self.size = nil
			self.urlString = nil
		}
	}
}
/// A feed advertised by a `<link rel="alternate">` tag.
public final class HTMLMetadataFeedLink: Sendable {

	public let title: String?
	public let type: String?
	public let urlString: String? // Absolute

	/// - Parameters:
	///   - urlString: Base URL used to resolve the feed's href.
	///   - tag: The link tag to read attributes from; all properties are nil when it has none.
	init(_ urlString: String, _ tag: HTMLTag) {

		if let attributes = tag.attributes {
			self.urlString = absoluteURLString(from: attributes, baseURL: urlString)
			self.title = attributes.object(forCaseInsensitiveKey: "title")
			self.type = attributes.object(forCaseInsensitiveKey: "type")
		}
		else {
			self.title = nil
			self.type = nil
			self.urlString = nil
		}
	}
}
/// A favicon declared by a `<link rel="icon">` tag.
public final class HTMLMetadataFavicon: Sendable {

	public let type: String?
	/// The icon URL resolved against the page URL.
	public let urlString: String?

	/// - Parameters:
	///   - urlString: Base URL used to resolve the icon's href.
	///   - tag: The link tag to read attributes from; all properties are nil when it has none.
	init(_ urlString: String, _ tag: HTMLTag) {

		if let attributes = tag.attributes {
			self.urlString = absoluteURLString(from: attributes, baseURL: urlString)
			self.type = attributes.object(forCaseInsensitiveKey: "type")
		}
		else {
			self.type = nil
			self.urlString = nil
		}
	}
}
/// Open Graph properties found in a page's `<meta>` tags.
public final class HTMLOpenGraphProperties: Sendable {

	// TODO: the rest. At this writing (Nov. 26, 2017) I just care about og:image.
	// See http://ogp.me/

	public let image: HTMLOpenGraphImage?

	/// - Parameters:
	///   - urlString: Page URL (not used by the parsing below).
	///   - tags: Tags collected from the page; only meta tags are examined.
	init(_ urlString: String, _ tags: [HTMLTag]) {
		self.image = Self.parse(tags)
	}
}

private extension HTMLOpenGraphProperties {

	private static let ogPrefix = "og:"

	struct OGKey {
		static let property = "property"
		static let content = "content"
	}

	struct OGValue {
		static let ogImage = "og:image"
		static let ogImageURL = "og:image:url"
		static let ogImageSecureURL = "og:image:secure_url"
		static let ogImageType = "og:image:type"
		static let ogImageAlt = "og:image:alt"
		static let ogImageWidth = "og:image:width"
		static let ogImageHeight = "og:image:height"
	}

	/// Builds the Open Graph image from `og:image*` meta tags.
	///
	/// Later tags overwrite values from earlier ones.
	/// - Returns: nil when no `og:image*` value was found.
	static func parse(_ tags: [HTMLTag]) -> HTMLOpenGraphImage? {

		// HTMLOpenGraphImage properties to fill in.
		var url: String?
		var secureURL: String?
		var mimeType: String?
		var width: CGFloat?
		var height: CGFloat?
		var altText: String?

		for tag in tags where tag.tagType == .meta {

			guard let attributes = tag.attributes,
				  let propertyName = attributes[OGKey.property], propertyName.hasPrefix(ogPrefix),
				  let content = attributes[OGKey.content] else {
				continue
			}

			switch propertyName {
			case OGValue.ogImage, OGValue.ogImageURL:
				url = content
			case OGValue.ogImageSecureURL:
				secureURL = content
			case OGValue.ogImageType:
				mimeType = content
			case OGValue.ogImageAlt:
				altText = content
			case OGValue.ogImageWidth:
				// Non-numeric dimensions are ignored.
				if let value = Double(content) {
					width = CGFloat(value)
				}
			case OGValue.ogImageHeight:
				if let value = Double(content) {
					height = CGFloat(value)
				}
			default:
				break
			}
		}

		let foundAnything = url != nil || secureURL != nil || mimeType != nil
			|| width != nil || height != nil || altText != nil
		guard foundAnything else {
			return nil
		}

		return HTMLOpenGraphImage(url: url, secureURL: secureURL, mimeType: mimeType, width: width, height: height, altText: altText)
	}
}
/// An image described by Open Graph `og:image*` meta properties.
/// Every value is optional and immutable once created.
public final class HTMLOpenGraphImage: Sendable {
public let url : String?
public let secureURL: String?
public let mimeType: String?
public let width: CGFloat?
public let height: CGFloat?
public let altText: String?
/// Memberwise initializer; stores each argument unchanged.
init(url: String?, secureURL: String?, mimeType: String?, width: CGFloat?, height: CGFloat?, altText: String?) {
self.url = url
self.secureURL = secureURL
self.mimeType = mimeType
self.width = width
self.height = height
self.altText = altText
}
}
/// Twitter card properties found in a page's `<meta>` tags.
public final class HTMLTwitterProperties: Sendable {

	public let imageURL: String? // twitter:image:src

	private struct TwitterKey {
		static let name = "name"
		static let content = "content"
	}

	private struct TwitterValue {
		static let imageSrc = "twitter:image:src"
	}

	/// - Parameters:
	///   - urlString: Page URL (not used by the lookup below).
	///   - tags: Tags collected from the page; the first meta tag named
	///     `twitter:image:src` with non-empty content provides `imageURL`.
	init(_ urlString: String, _ tags: [HTMLTag]) {

		var foundImageURL: String?

		for tag in tags where tag.tagType == .meta {
			guard let attributes = tag.attributes,
				  attributes[TwitterKey.name] == TwitterValue.imageSrc,
				  let content = attributes[TwitterKey.content], !content.isEmpty else {
				continue
			}
			foundImageURL = content
			break
		}

		self.imageURL = foundImageURL
	}
}
/// The tag's URL: its `href` attribute, falling back to `src`.
/// Both lookups are case-insensitive.
private func urlString(from attributes: HTMLTagAttributes) -> String? {
	attributes.object(forCaseInsensitiveKey: "href") ?? attributes.object(forCaseInsensitiveKey: "src")
}

/// The tag's `rel` attribute, looked up case-insensitively.
private func relValue(from attributes: HTMLTagAttributes) -> String? {
	let rel = attributes.object(forCaseInsensitiveKey: "rel")
	return rel
}
/// The tag's URL (href or src) resolved against `baseURL`.
/// - Returns: nil when the tag has no non-empty URL or resolution fails.
private func absoluteURLString(from attributes: HTMLTagAttributes, baseURL: String) -> String? {
	let relativeURLString = urlString(from: attributes) ?? ""
	if relativeURLString.isEmpty {
		return nil
	}
	return absoluteURLStringWithRelativeURLString(relativeURLString, baseURLString: baseURL)
}
/// Resolves `relativeURLString` against `baseURLString` and returns the
/// standardized absolute URL string.
/// - Returns: nil when either string cannot be turned into a URL.
private func absoluteURLStringWithRelativeURLString(_ relativeURLString: String, baseURLString: String) -> String? {
	guard let base = URL(string: baseURLString),
		  let resolved = URL(string: relativeURLString, relativeTo: base) else {
		return nil
	}
	return resolved.absoluteURL.standardized.absoluteString
}

View File

@@ -0,0 +1,102 @@
//
// HTMLMetadataParser.swift
//
//
// Created by Brent Simmons on 9/22/24.
//
import Foundation
import RSCore
/// Collects the `<link>` and `<meta>` tags of an HTML page and turns them into `HTMLMetadata`.
public final class HTMLMetadataParser {

	private var tags = [HTMLTag]()

	/// Parses `parserData` and returns the metadata found in it.
	public static func metadata(with parserData: ParserData) -> HTMLMetadata {
		let parser = HTMLMetadataParser()
		return parser.parse(parserData)
	}
}

private extension HTMLMetadataParser {

	/// Runs the SAX HTML parser with `self` as delegate, then builds the
	/// metadata from the tags gathered by the delegate callbacks.
	func parse(_ parserData: ParserData) -> HTMLMetadata {

		tags = []

		SAXHTMLParser(delegate: self, data: parserData.data).parse()

		return HTMLMetadata(parserData.url, tags)
	}
}
extension HTMLMetadataParser: SAXHTMLParserDelegate {

	private struct HTMLName {
		static let link = "link".utf8CString
		static let meta = "meta".utf8CString
	}

	private struct HTMLKey {
		static let href = "href"
		static let src = "src"
		static let rel = "rel"
	}

	/// The tag's URL: `href`, falling back to `src` (case-insensitive lookups).
	private func link(with attributes: StringDictionary) -> String? {
		attributes.object(forCaseInsensitiveKey: HTMLKey.href) ?? attributes.object(forCaseInsensitiveKey: HTMLKey.src)
	}

	/// Records a link tag, but only when it has both a non-empty rel and a non-empty URL.
	private func handleLinkAttributes(_ attributes: StringDictionary) {
		guard let rel = attributes.object(forCaseInsensitiveKey: HTMLKey.rel), !rel.isEmpty,
			  let url = link(with: attributes), !url.isEmpty else {
			return
		}
		tags.append(HTMLTag(tagType: .link, attributes: attributes))
	}

	/// Records a meta tag unconditionally.
	private func handleMetaAttributes(_ attributes: StringDictionary) {
		tags.append(HTMLTag(tagType: .meta, attributes: attributes))
	}

	/// On `<link>` or `<meta>` with at least one attribute: record the tag.
	public func saxHTMLParser(_ saxHTMLParser: SAXHTMLParser, startElement name: XMLPointer, attributes: UnsafePointer<XMLPointer?>?) {

		let isLink = SAXEqualTags(name, HTMLName.link)
		guard isLink || SAXEqualTags(name, HTMLName.meta) else {
			return
		}

		guard let dictionary = saxHTMLParser.attributesDictionary(attributes), !dictionary.isEmpty else {
			return
		}

		if isLink {
			handleLinkAttributes(dictionary)
		}
		else {
			handleMetaAttributes(dictionary)
		}
	}

	public func saxHTMLParser(_: SAXHTMLParser, endElement: XMLPointer) {
		// Nothing to do
	}

	public func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int) {
		// Nothing to do
	}
}

View File

@@ -0,0 +1,26 @@
//
// HTMLTag.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
/// Attribute names mapped to attribute values for one HTML tag.
public typealias HTMLTagAttributes = [String: String]
/// A `<link>` or `<meta>` tag together with its attributes.
public struct HTMLTag: Sendable {
/// The kinds of tag that are represented.
public enum TagType: Sendable {
case link
case meta
}
public let tagType: TagType
public let attributes: HTMLTagAttributes?
/// Memberwise initializer; stores both arguments unchanged.
public init(tagType: TagType, attributes: HTMLTagAttributes?) {
self.tagType = tagType
self.attributes = attributes
}
}

View File

@@ -0,0 +1,53 @@
//
// OPMLAttributes.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
// OPML allows for arbitrary attributes.
// These are the common attributes in OPML files used as RSS subscription lists.
/// Attribute names commonly found on OPML outline elements.
private enum OPMLKey {
	static let text = "text"
	static let title = "title"
	static let description = "description"
	static let type = "type"
	static let version = "version"
	static let htmlURL = "htmlUrl"
	static let xmlURL = "xmlUrl"
}

// A frequent error in OPML files is to mess up the capitalization,
// so these do a case-insensitive lookup.

extension Dictionary where Key == String, Value == String {

	/// The `text` attribute.
	var opml_text: String? {
		return object(forCaseInsensitiveKey: OPMLKey.text)
	}

	/// The `title` attribute.
	var opml_title: String? {
		return object(forCaseInsensitiveKey: OPMLKey.title)
	}

	/// The `description` attribute.
	var opml_description: String? {
		return object(forCaseInsensitiveKey: OPMLKey.description)
	}

	/// The `type` attribute.
	var opml_type: String? {
		return object(forCaseInsensitiveKey: OPMLKey.type)
	}

	/// The `version` attribute.
	var opml_version: String? {
		return object(forCaseInsensitiveKey: OPMLKey.version)
	}

	/// The `htmlUrl` attribute.
	var opml_htmlUrl: String? {
		return object(forCaseInsensitiveKey: OPMLKey.htmlURL)
	}

	/// The `xmlUrl` attribute.
	var opml_xmlUrl: String? {
		return object(forCaseInsensitiveKey: OPMLKey.xmlURL)
	}
}

View File

@@ -0,0 +1,19 @@
//
// OPMLDocument.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
/// The root item of a parsed OPML file.
///
/// It is created with no attributes of its own; `title` and `url` are plain
/// mutable properties that start out as nil and the supplied URL respectively.
public final class OPMLDocument: OPMLItem {

	/// Document title; nil until something assigns it.
	public var title: String?

	/// The URL string this document was created with, if any.
	public var url: String?

	init(url: String?) {
		self.url = url
		super.init(attributes: nil)
	}
}

View File

@@ -0,0 +1,40 @@
//
// OPMLFeedSpecifier.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
/// Describes one feed found in an OPML outline.
///
/// The optional text fields are normalized on creation: an empty string is
/// stored as nil, so callers only ever see nil or a non-empty value.
public struct OPMLFeedSpecifier: Sendable {

	public let title: String?
	public let feedDescription: String?
	public let homePageURL: String?
	public let feedURL: String

	/// Stores `feedURL` as given and collapses empty optional fields to nil.
	init(title: String?, feedDescription: String?, homePageURL: String?, feedURL: String) {
		self.title = String.isEmptyOrNil(title) ? nil : title
		self.feedDescription = String.isEmptyOrNil(feedDescription) ? nil : feedDescription
		self.homePageURL = String.isEmptyOrNil(homePageURL) ? nil : homePageURL
		self.feedURL = feedURL
	}
}

View File

@@ -0,0 +1,42 @@
//
// OPMLItem.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
import os
/// One node of an OPML outline: its raw attributes, an optional feed
/// specifier derived from them, and any child items.
public class OPMLItem {

	/// Non-nil only when the attributes contain an `xmlUrl` value.
	public let feedSpecifier: OPMLFeedSpecifier?

	/// The attributes exactly as passed to the initializer.
	public let attributes: [String: String]?

	/// The `title` attribute if present, otherwise the `text` attribute.
	public let titleFromAttributes: String?

	/// Child items; stays nil until the first call to `add(_:)`.
	public var items: [OPMLItem]?

	/// True when this item has at least one child.
	public var isFolder: Bool {
		guard let items else {
			return false
		}
		return !items.isEmpty
	}

	init(attributes: [String : String]?) {
		let resolvedTitle = attributes?.opml_title ?? attributes?.opml_text
		self.attributes = attributes
		self.titleFromAttributes = resolvedTitle

		guard let feedURL = attributes?.opml_xmlUrl else {
			self.feedSpecifier = nil
			return
		}
		self.feedSpecifier = OPMLFeedSpecifier(title: resolvedTitle, feedDescription: attributes?.opml_description, homePageURL: attributes?.opml_htmlUrl, feedURL: feedURL)
	}

	/// Appends `item` to `items`, creating the array on first use.
	public func add(_ item: OPMLItem) {
		guard items != nil else {
			items = [item]
			return
		}
		items?.append(item)
	}
}

View File

@@ -0,0 +1,117 @@
//
// OPMLParser.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
/// Parses OPML data into an `OPMLDocument` tree.
public final class OPMLParser {

	private let parserData: ParserData
	private var opmlDocument: OPMLDocument?

	/// Items currently open during parsing; the last element is the innermost one.
	private var itemStack: [OPMLItem] = []

	private var data: Data {
		parserData.data
	}

	private var currentItem: OPMLItem? {
		itemStack.last
	}

	/// Returns nil if data can't be parsed (if it's not OPML).
	public static func document(with parserData: ParserData) -> OPMLDocument? {
		let parser = OPMLParser(parserData)
		parser.parse()
		return parser.opmlDocument
	}

	init(_ parserData: ParserData) {
		self.parserData = parserData
	}
}
private extension OPMLParser {

	/// Creates the document, makes it the bottom of the item stack, and runs
	/// the SAX parser over the data. Does nothing when the data does not
	/// look like OPML, which leaves `opmlDocument` nil.
	func parse() {
		guard canParseData() else {
			return
		}

		let document = OPMLDocument(url: parserData.url)
		opmlDocument = document
		push(document)

		let xmlParser = SAXParser(delegate: self, data: data)
		xmlParser.parse()
	}

	/// Cheap sniff test: the bytes must contain "<opml" somewhere.
	func canParseData() -> Bool {
		data.containsASCIIString("<opml")
	}

	/// Makes `item` the current (innermost) item.
	func push(_ item: OPMLItem) {
		itemStack.append(item)
	}

	/// Drops the current item. Popping an empty stack is a programmer error
	/// (asserts in debug builds) and is otherwise ignored.
	func popItem() {
		if itemStack.isEmpty {
			assertionFailure("itemStack.count must be > 0")
			return
		}
		itemStack.removeLast()
	}
}
extension OPMLParser: SAXParserDelegate {

	/// Tag names, as 0-terminated C strings, in the form `SAXEqualTags` expects.
	private enum XMLName {
		static let title = "title".utf8CString
		static let outline = "outline".utf8CString
	}

	/// `title` starts character storage; `outline` creates a new item, adds it
	/// to the current item, and makes it current. Other elements are ignored.
	public func saxParser(_ saxParser: SAXParser, xmlStartElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {
		if SAXEqualTags(localName, XMLName.title) {
			saxParser.beginStoringCharacters()
		} else if SAXEqualTags(localName, XMLName.outline) {
			let outlineAttributes = saxParser.attributesDictionary(attributes, attributeCount: attributeCount)
			let outlineItem = OPMLItem(attributes: outlineAttributes)
			currentItem?.add(outlineItem)
			push(outlineItem)
		}
	}

	/// `title` assigns the stored text to the document (only when the document
	/// itself is the current item) and stops storage; `outline` pops the current item.
	public func saxParser(_ saxParser: SAXParser, xmlEndElement localName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {
		if SAXEqualTags(localName, XMLName.title) {
			if let document = currentItem as? OPMLDocument {
				document.title = saxParser.currentStringWithTrimmedWhitespace
			}
			saxParser.endStoringCharacters()
		} else if SAXEqualTags(localName, XMLName.outline) {
			popItem()
		}
	}

	public func saxParser(_: SAXParser, xmlCharactersFound: XMLPointer, count: Int) {
		// Nothing to do, but method is required.
	}
}

View File

@@ -0,0 +1,68 @@
//
// Data+Parser.swift
//
//
// Created by Brent Simmons on 8/24/24.
//
import Foundation
public extension Data {

	/// Return true if the data contains a given String.
	///
	/// Assumes that the data is UTF-8 or similar encoding;
	/// if it's UTF-16 or UTF-32, for instance, this will always return false.
	/// Luckily these are rare.
	///
	/// The String to search for should be something that could be encoded
	/// in ASCII, like "<opml" or "<rss". (In other words,
	/// the sequence of characters would always be the same in
	/// commonly-used encodings.)
	func containsASCIIString(_ searchFor: String) -> Bool {
		contains(searchFor.utf8)
	}

	/// Return true if searchFor appears in self.
	///
	/// Returns false when `searchFor` is empty or longer than `self`.
	/// Both values may be slices: the comparison runs over zero-based byte
	/// buffers, so it does not depend on either value's `startIndex`.
	func contains(_ searchFor: Data) -> Bool {
		let searchForCount = searchFor.count
		let dataCount = self.count
		guard searchForCount > 0, searchForCount <= dataCount else {
			return false
		}

		return self.withUnsafeBytes { (haystack: UnsafeRawBufferPointer) -> Bool in
			searchFor.withUnsafeBytes { (needle: UnsafeRawBufferPointer) -> Bool in
				let initialByte = needle[0]
				// Only positions where the first byte matches are worth a full comparison.
				for start in 0...(dataCount - searchForCount) where haystack[start] == initialByte {
					var matched = true
					for offset in 1..<searchForCount where haystack[start + offset] != needle[offset] {
						matched = false
						break
					}
					if matched {
						return true
					}
				}
				return false
			}
		}
	}
}

View File

@@ -0,0 +1,28 @@
//
// Dictionary+Parser.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
public extension Dictionary where Key == String, Value == String {

	/// Looks up `key`, ignoring capitalization.
	///
	/// An exact-case match is returned first. Otherwise the entries are
	/// scanned and the value of the first key that compares equal
	/// case-insensitively is returned; nil when nothing matches.
	func object(forCaseInsensitiveKey key: String) -> String? {
		// Fast path: the key is stored with exactly this capitalization.
		if let exactMatch = self[key] {
			return exactMatch
		}

		let foldedKey = key.lowercased()
		let matchingEntry = first(where: { entry in
			foldedKey.caseInsensitiveCompare(entry.key) == .orderedSame
		})
		return matchingEntry?.value
	}
}

View File

@@ -0,0 +1,23 @@
//
// String+Parser.swift
// Parser
//
// Created by Nate Weaver on 2020-01-19.
// Copyright © 2020 Ranchero Software, LLC. All rights reserved.
//
import Foundation
public extension String {

	/// `self`, unchanged, unless it is empty or consists only of whitespace
	/// and newlines, in which case nil.
	var nilIfEmptyOrWhitespace: String? {
		let trimmed = trimmingCharacters(in: .whitespacesAndNewlines)
		guard !trimmed.isEmpty else {
			return nil
		}
		return self
	}

	/// True when `s` is nil or the empty string.
	static func isEmptyOrNil(_ s: String?) -> Bool {
		s?.isEmpty ?? true
	}
}

View File

@@ -0,0 +1,19 @@
//
// ParserData.swift
//
//
// Created by Brent Simmons on 8/18/24.
//
import Foundation
/// Input for a parser: raw bytes paired with the URL string they are associated with.
public struct ParserData: Sendable {

	/// URL string associated with `data`.
	public let url: String

	/// The raw bytes to parse.
	public let data: Data

	/// Stores both values as given.
	public init(url: String, data: Data) {
		self.data = data
		self.url = url
	}
}

View File

@@ -0,0 +1,200 @@
//
// SAXHTMLParser.swift
//
//
// Created by Brent Simmons on 8/26/24.
//
import Foundation
import RSCore
import libxml2
/// Receives SAX events from a `SAXHTMLParser` while it parses.
public protocol SAXHTMLParserDelegate: AnyObject {
/// Called when an element starts. `attributes` can be converted with `SAXHTMLParser.attributesDictionary(_:)`.
func saxHTMLParser(_: SAXHTMLParser, startElement: XMLPointer, attributes: UnsafePointer<XMLPointer?>?)
/// Called when an element ends. The parser stops storing characters right after this call returns.
func saxHTMLParser(_: SAXHTMLParser, endElement: XMLPointer)
/// Called for each run of character data, whether or not characters are being stored.
// Length is guaranteed to be greater than 0.
func saxHTMLParser(_: SAXHTMLParser, charactersFound: XMLPointer, count: Int)
}
/// Push-parses HTML data with libxml2 and forwards SAX events to a delegate.
public final class SAXHTMLParser {

	fileprivate let delegate: SAXHTMLParserDelegate

	private var data: Data
	private var storingCharacters = false
	private var characters = Data()

	/// UTF-8 encoded bytes collected since `beginStoringCharacters()`; nil while not storing.
	public var currentCharacters: Data? {
		storingCharacters ? characters : nil
	}

	/// `currentCharacters` decoded as UTF-8; nil when not storing, when nothing
	/// has been collected, or when the bytes are not valid UTF-8.
	public var currentString: String? {
		if let bytes = currentCharacters, !bytes.isEmpty {
			return String(data: bytes, encoding: .utf8)
		}
		return nil
	}

	/// `currentString` with leading and trailing whitespace and newlines removed.
	public var currentStringWithTrimmedWhitespace: String? {
		currentString?.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
	}

	public init(delegate: SAXHTMLParser.Delegate, data: Data) {
		self.delegate = delegate
		self.data = data
	}

	/// Synonym used only to keep the initializer signature readable.
	public typealias Delegate = SAXHTMLParserDelegate

	/// Parses the whole of `data` in one pass, calling the delegate as events
	/// occur. Returns immediately when `data` is empty.
	public func parse() {
		if data.isEmpty {
			return
		}

		data.withUnsafeBytes { rawBuffer in
			guard let bytes = rawBuffer.bindMemory(to: CChar.self).baseAddress else {
				return
			}

			let byteCount = Int32(data.count)
			let encoding = xmlDetectCharEncoding(bytes, byteCount)

			// `self` is passed unretained as the user-data pointer that comes back in every callback.
			let context = htmlCreatePushParserCtxt(&saxHandlerStruct, Unmanaged.passUnretained(self).toOpaque(), nil, 0, nil, encoding)
			let options = HTML_PARSE_RECOVER.rawValue | HTML_PARSE_NONET.rawValue | HTML_PARSE_COMPACT.rawValue | HTML_PARSE_NOERROR.rawValue | HTML_PARSE_NOWARNING.rawValue
			htmlCtxtUseOptions(context, Int32(options))

			htmlParseChunk(context, bytes, byteCount, 0) // all of the bytes
			htmlParseChunk(context, nil, 0, 1) // final, empty chunk: terminate
			htmlFreeParserCtxt(context)
		}
	}

	/// Delegate can call from its start-element callback. Characters are then
	/// available as `currentCharacters` in the end-element callback. Storing
	/// characters is stopped after each end-element callback.
	public func beginStoringCharacters() {
		characters.count = 0
		storingCharacters = true
	}

	/// Stops collecting characters and discards whatever was collected.
	public func endStoringCharacters() {
		characters.count = 0
		storingCharacters = false
	}

	/// Converts the attribute array of a start-element callback into a dictionary.
	///
	/// The array is read as alternating name and value pointers and ends at
	/// the first nil found in a name position. A nil in a value position
	/// yields an empty string. Returns nil only when `attributes` itself is nil.
	public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?) -> StringDictionary? {
		guard let attributes else {
			return nil
		}

		var result = [String: String]()
		var index = 0
		while let namePointer = attributes[index] {
			let name = String(cString: namePointer)
			if let valuePointer = attributes[index + 1] {
				result[name] = String(cString: valuePointer)
			} else {
				result[name] = ""
			}
			index += 2
		}
		return result
	}
}
private extension SAXHTMLParser {

	/// Appends the bytes to the stored characters (when storing), then notifies the delegate.
	func charactersFound(_ bytes: XMLPointer, count: Int) {
		if storingCharacters {
			characters.append(bytes, count: count)
		}
		delegate.saxHTMLParser(self, charactersFound: bytes, count: count)
	}

	/// Forwards the start of an element to the delegate.
	func startElement(_ elementName: XMLPointer, attributes: UnsafePointer<XMLPointer?>?) {
		delegate.saxHTMLParser(self, startElement: elementName, attributes: attributes)
	}

	/// Notifies the delegate first, so it can still read the stored characters,
	/// and only then stops storing and clears them.
	func endElement(_ elementName: XMLPointer) {
		delegate.saxHTMLParser(self, endElement: elementName)
		endStoringCharacters()
	}
}
/// Recovers the `SAXHTMLParser` from the opaque pointer handed to a callback, without changing its retain count.
private func parser(from context: UnsafeMutableRawPointer) -> SAXHTMLParser {
	let unmanagedParser = Unmanaged<SAXHTMLParser>.fromOpaque(context)
	return unmanagedParser.takeUnretainedValue()
}
/// The handler table passed to `htmlCreatePushParserCtxt`. Each callback
/// recovers the parser from the context pointer and forwards the event;
/// callbacks that arrive without a context or a name/buffer are dropped.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {
	var handler = htmlSAXHandler()

	// Zero-length (or negative-length) character runs are never forwarded.
	handler.characters = { (context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) in
		if let context, let ch, len > 0 {
			parser(from: context).charactersFound(ch, count: Int(len))
		}
	}

	handler.startElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?, attributes: UnsafeMutablePointer<XMLPointer?>?) in
		if let context, let name {
			parser(from: context).startElement(name, attributes: attributes)
		}
	}

	handler.endElement = { (context: UnsafeMutableRawPointer?, name: XMLPointer?) in
		if let context, let name {
			parser(from: context).endElement(name)
		}
	}

	return handler
}()

View File

@@ -0,0 +1,204 @@
//
// SAXParser.swift
//
//
// Created by Brent Simmons on 8/12/24.
//
import Foundation
import RSCore
import libxml2
/// Pointer to libxml2 character data (`xmlChar`); the type used for names and text in the SAX callbacks.
public typealias XMLPointer = UnsafePointer<xmlChar>
/// Receives SAX events from a `SAXParser` while it parses.
public protocol SAXParserDelegate {
/// Called when an element starts. `attributes` and `attributeCount` can be converted with `SAXParser.attributesDictionary(_:attributeCount:)`.
func saxParser(_: SAXParser, xmlStartElement: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?)
/// Called when an element ends. The parser stops storing characters right after this call returns.
func saxParser(_: SAXParser, xmlEndElement: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?)
/// Called for each run of character data, whether or not characters are being stored. `count` is always greater than 0.
func saxParser(_: SAXParser, xmlCharactersFound: XMLPointer, count: Int)
}
/// Push-parses XML data with libxml2 and forwards SAX events to a delegate.
public final class SAXParser {

	fileprivate let delegate: SAXParserDelegate

	/// UTF-8 encoded bytes collected since `beginStoringCharacters()`; nil while not storing.
	public var currentCharacters: Data? { // UTF-8 encoded
		guard storingCharacters else {
			return nil
		}
		return characters
	}

	// Conveniences to get string version of currentCharacters

	/// `currentCharacters` decoded as UTF-8; nil when not storing, when nothing
	/// has been collected, or when the bytes are not valid UTF-8.
	public var currentString: String? {
		guard let d = currentCharacters, !d.isEmpty else {
			return nil
		}
		return String(data: d, encoding: .utf8)
	}

	/// `currentString` with leading and trailing whitespace and newlines removed.
	public var currentStringWithTrimmedWhitespace: String? {
		guard let s = currentString else {
			return nil
		}
		return s.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
	}

	private var data: Data
	private var storingCharacters = false
	private var characters = Data()

	public init(delegate: SAXParserDelegate, data: Data) {
		self.delegate = delegate
		self.data = data
	}

	/// Parses the whole of `data` in one pass, calling the delegate as events
	/// occur. Returns immediately when `data` is empty.
	public func parse() {
		guard !data.isEmpty else {
			return
		}

		// `self` is passed unretained as the user-data pointer that comes back in every callback.
		let context = xmlCreatePushParserCtxt(&saxHandlerStruct, Unmanaged.passUnretained(self).toOpaque(), nil, 0, nil)
		// NOTE(review): XML_PARSE_NOENT turns on entity substitution — confirm that is intended for untrusted input.
		xmlCtxtUseOptions(context, Int32(XML_PARSE_RECOVER.rawValue | XML_PARSE_NOENT.rawValue))

		data.withUnsafeBytes { bufferPointer in
			if let bytes = bufferPointer.bindMemory(to: CChar.self).baseAddress {
				xmlParseChunk(context, bytes, Int32(data.count), 0)
			}
		}

		xmlParseChunk(context, nil, 0, 1) // final, empty chunk: terminate
		xmlFreeParserCtxt(context)
	}

	/// Delegate can call from xmlStartElement. Characters will be available in xmlEndElement as currentCharacters property. Storing characters is stopped after each xmlEndElement.
	public func beginStoringCharacters() {
		storingCharacters = true
		characters.count = 0
	}

	/// Stops collecting characters and discards whatever was collected.
	public func endStoringCharacters() {
		storingCharacters = false
		characters.count = 0
	}

	/// Converts the attribute array of a start-element callback into a dictionary.
	///
	/// Each attribute occupies `fieldCount` (5) consecutive pointers. This method
	/// reads slot 0 (name), slot 1 (prefix, may be nil), and slots 3 and 4
	/// (start and end of the value bytes). A prefixed attribute is stored under
	/// the key "prefix:name".
	///
	/// An attribute is skipped when its name or value pointers are nil, when its
	/// value range is negative, or when its value is not valid UTF-8.
	///
	/// - Returns: nil when `attributeCount` is not positive or `attributes` is nil;
	///   otherwise the dictionary of attributes that could be read.
	public func attributesDictionary(_ attributes: UnsafePointer<XMLPointer?>?, attributeCount: Int) -> StringDictionary? {
		guard attributeCount > 0, let attributes else {
			return nil
		}

		var dictionary = [String: String]()
		let fieldCount = 5

		// A `for` loop always advances, so skipping an attribute with `continue`
		// cannot stall the way it did with manually incremented counters.
		for attributeIndex in 0..<attributeCount {
			let base = attributeIndex * fieldCount

			guard let attribute = attributes[base] else {
				continue
			}
			var attributeName = String(cString: attribute)
			if let prefix = attributes[base + 1] {
				let attributePrefix = String(cString: prefix)
				attributeName = "\(attributePrefix):\(attributeName)"
			}

			guard let valueStart = attributes[base + 3], let valueEnd = attributes[base + 4] else {
				continue
			}
			let valueCount = valueEnd - valueStart
			guard valueCount >= 0 else {
				// A negative length would trap when building the buffer below.
				continue
			}

			let value = String(bytes: UnsafeRawBufferPointer(start: valueStart, count: Int(valueCount)), encoding: .utf8)
			if let value {
				dictionary[attributeName] = value
			}
		}

		return dictionary
	}
}
private extension SAXParser {

	/// Appends the bytes to the stored characters (when storing), then notifies the delegate.
	func charactersFound(_ bytes: XMLPointer, count: Int) {
		if storingCharacters {
			characters.append(bytes, count: count)
		}
		delegate.saxParser(self, xmlCharactersFound: bytes, count: count)
	}

	/// Forwards the start of an element, with all of its arguments untouched, to the delegate.
	func startElement(_ elementName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?, namespaceCount: Int, namespaces: UnsafePointer<XMLPointer?>?, attributeCount: Int, attributesDefaultedCount: Int, attributes: UnsafePointer<XMLPointer?>?) {
		delegate.saxParser(
			self,
			xmlStartElement: elementName,
			prefix: prefix,
			uri: uri,
			namespaceCount: namespaceCount,
			namespaces: namespaces,
			attributeCount: attributeCount,
			attributesDefaultedCount: attributesDefaultedCount,
			attributes: attributes
		)
	}

	/// Notifies the delegate first, so it can still read the stored characters,
	/// and only then stops storing and clears them.
	func endElement(_ elementName: XMLPointer, prefix: XMLPointer?, uri: XMLPointer?) {
		delegate.saxParser(self, xmlEndElement: elementName, prefix: prefix, uri: uri)
		endStoringCharacters()
	}
}
/// Start-element callback installed in `saxHandlerStruct`: converts the C
/// integer counts to `Int` and forwards to the parser recovered from `context`.
/// Does nothing when `context` or `name` is nil.
private func startElement(_ context: UnsafeMutableRawPointer?, name: XMLPointer?, prefix: XMLPointer?, URI: XMLPointer?, nb_namespaces: CInt, namespaces: UnsafeMutablePointer<XMLPointer?>?, nb_attributes: CInt, nb_defaulted: CInt, attributes: UnsafeMutablePointer<XMLPointer?>?) {
	if let context, let name {
		parser(from: context).startElement(
			name,
			prefix: prefix,
			uri: URI,
			namespaceCount: Int(nb_namespaces),
			namespaces: namespaces,
			attributeCount: Int(nb_attributes),
			attributesDefaultedCount: Int(nb_defaulted),
			attributes: attributes
		)
	}
}
/// End-element callback installed in `saxHandlerStruct`: forwards to the
/// parser recovered from `context`. Does nothing when `context` or `name` is nil.
private func endElement(_ context: UnsafeMutableRawPointer?, name: XMLPointer?, prefix: XMLPointer?, URI: XMLPointer?) {
	if let context, let name {
		parser(from: context).endElement(name, prefix: prefix, uri: URI)
	}
}
/// Character-data callback installed in `saxHandlerStruct`: forwards to the
/// parser recovered from `context`. Runs with a nil context, a nil buffer, or
/// a non-positive length are dropped.
private func charactersFound(_ context: UnsafeMutableRawPointer?, ch: XMLPointer?, len: CInt) {
	if let context, let ch, len > 0 {
		parser(from: context).charactersFound(ch, count: Int(len))
	}
}
/// Recovers the `SAXParser` from the opaque pointer handed to a callback, without changing its retain count.
private func parser(from context: UnsafeMutableRawPointer) -> SAXParser {
	let unmanagedParser = Unmanaged<SAXParser>.fromOpaque(context)
	return unmanagedParser.takeUnretainedValue()
}
/// The handler table passed to `xmlCreatePushParserCtxt`: only the characters
/// and namespace-aware start/end element callbacks are installed.
nonisolated(unsafe) private var saxHandlerStruct: xmlSAXHandler = {
	var handler = xmlSAXHandler()

	// NOTE(review): presumably the SAX2 marker is what makes libxml2 use the *Ns callbacks — confirm against libxml2 docs.
	handler.initialized = XML_SAX2_MAGIC

	handler.startElementNs = startElement
	handler.endElementNs = endElement
	handler.characters = charactersFound

	return handler
}()

View File

@@ -0,0 +1,41 @@
//
// SAXUtilities.swift
//
//
// Created by Brent Simmons on 8/26/24.
//
import Foundation
import libxml2
/// Returns true when the 0-terminated C string at `localName` is byte-for-byte
/// equal to `tag`.
///
/// - Parameters:
///   - localName: A 0-terminated name, as delivered by the SAX callbacks.
///   - tag: The name to compare against, including its 0 terminator
///     (the form `String.utf8CString` produces).
/// - Returns: false for an empty `tag` (one without even a terminator).
///
/// Bytes are compared by bit pattern, so tags containing non-ASCII UTF-8
/// (negative `Int8` values) are compared correctly instead of trapping.
public func SAXEqualTags(_ localName: XMLPointer, _ tag: ContiguousArray<Int8>) -> Bool {
	return tag.withUnsafeBufferPointer { (tagBytes: UnsafeBufferPointer<Int8>) -> Bool in
		// The last element of `tag` is its 0 terminator; everything before it is the name.
		guard let terminatorIndex = tagBytes.indices.last else {
			return false
		}

		for i in 0..<terminatorIndex {
			let localNameCharacter = localName[i]
			if localNameCharacter == 0 {
				// localName ended early; stop before reading past its terminator.
				return false
			}
			if localNameCharacter != UInt8(bitPattern: tagBytes[i]) {
				return false
			}
		}

		// localName might actually be longer; make sure it's the same length as tag.
		return localName[terminatorIndex] == 0
	}
}
public extension String {

	/// Creates a string from UTF-8 bytes at `xmlPointer`.
	///
	/// - Parameters:
	///   - xmlPointer: Start of the bytes.
	///   - count: Number of bytes to read. When nil, the length is measured
	///     with `strlen`, so the bytes must then be 0-terminated.
	/// - Returns: nil when the bytes are not valid UTF-8.
	init?(xmlPointer: XMLPointer, count: Int? = nil) {
		let byteCount: Int
		if let count {
			byteCount = count
		} else {
			byteCount = strlen(xmlPointer)
		}
		let bytes = Data(bytes: xmlPointer, count: byteCount)
		self.init(data: bytes, encoding: .utf8)
	}
}