From 24e7eb90f6da02ed0aed28c08a98df2300e4bda8 Mon Sep 17 00:00:00 2001 From: Brent Simmons Date: Mon, 9 Sep 2024 20:54:42 -0700 Subject: [PATCH] Continue progress on porting feed parsers. --- ...78BB49A7-AEB4-40A1-83DA-EB9C5755E396.plist | 10 + .../Sources/FeedParser/Feeds/FeedParser.swift | 124 ++--- .../Sources/FeedParser/Feeds/FeedType.swift | 82 +-- .../Feeds/JSON/JSONFeedParser.swift | 490 +++++++++--------- .../Feeds/JSON/RSSInJSONParser.swift | 358 ++++++------- .../Sources/FeedParser/Feeds/ParsedItem.swift | 5 + .../FeedParser/Feeds/XML/AtomParser.swift | 20 +- .../Feeds/XML/RSParsedFeedTransformer.swift | 124 ++--- .../FeedParser/Feeds/XML/RSSParser.swift | 203 ++++++-- Modules/Parser/Sources/SAX/SAXUtilities.swift | 2 +- 10 files changed, 789 insertions(+), 629 deletions(-) diff --git a/Modules/Parser/.swiftpm/xcode/xcshareddata/xcbaselines/DateParserTests.xcbaseline/78BB49A7-AEB4-40A1-83DA-EB9C5755E396.plist b/Modules/Parser/.swiftpm/xcode/xcshareddata/xcbaselines/DateParserTests.xcbaseline/78BB49A7-AEB4-40A1-83DA-EB9C5755E396.plist index d7dbb6cb7..d2b13bd80 100644 --- a/Modules/Parser/.swiftpm/xcode/xcshareddata/xcbaselines/DateParserTests.xcbaseline/78BB49A7-AEB4-40A1-83DA-EB9C5755E396.plist +++ b/Modules/Parser/.swiftpm/xcode/xcshareddata/xcbaselines/DateParserTests.xcbaseline/78BB49A7-AEB4-40A1-83DA-EB9C5755E396.plist @@ -6,6 +6,16 @@ DateParserTests + testPubDateParsingPerformance() + + com.apple.XCTPerformanceMetric_WallClockTime + + baselineAverage + 0.000131 + baselineIntegrationDisplayName + Local Baseline + + testW3CParsingPerformance() com.apple.XCTPerformanceMetric_WallClockTime diff --git a/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift index d1e78b464..64f6e5e97 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/FeedParser.swift @@ -12,65 +12,65 @@ import SAX // FeedParser handles RSS, Atom, JSON Feed, and 
RSS-in-JSON. // You don’t need to know the type of feed. -public struct FeedParser { - - public static func canParse(_ parserData: ParserData) -> Bool { - - let type = feedType(parserData) - - switch type { - case .jsonFeed, .rssInJSON, .rss, .atom: - return true - default: - return false - } - } - - public static func parse(_ parserData: ParserData) async throws -> ParsedFeed? { - - let type = feedType(parserData) - - switch type { - - case .jsonFeed: - return try JSONFeedParser.parse(parserData) - - case .rssInJSON: - return try RSSInJSONParser.parse(parserData) - - case .rss: - return RSSParser.parse(parserData) - - case .atom: - return AtomParser.parse(parserData) - - case .unknown, .notAFeed: - return nil - } - } - - /// For unit tests measuring performance. - public static func parseSync(_ parserData: ParserData) throws -> ParsedFeed? { - - let type = feedType(parserData) - - switch type { - - case .jsonFeed: - return try JSONFeedParser.parse(parserData) - - case .rssInJSON: - return try RSSInJSONParser.parse(parserData) - - case .rss: - return RSSParser.parse(parserData) - - case .atom: - return AtomParser.parse(parserData) - - case .unknown, .notAFeed: - return nil - } - } - -} +//public struct FeedParser { +// +// public static func canParse(_ parserData: ParserData) -> Bool { +// +// let type = feedType(parserData) +// +// switch type { +// case .jsonFeed, .rssInJSON, .rss, .atom: +// return true +// default: +// return false +// } +// } +// +// public static func parse(_ parserData: ParserData) async throws -> ParsedFeed? { +// +// let type = feedType(parserData) +// +// switch type { +// +// case .jsonFeed: +// return try JSONFeedParser.parse(parserData) +// +// case .rssInJSON: +// return try RSSInJSONParser.parse(parserData) +// +// case .rss: +// return RSSParser.parse(parserData) +// +// case .atom: +// return AtomParser.parse(parserData) +// +// case .unknown, .notAFeed: +// return nil +// } +// } +// +// /// For unit tests measuring performance. 
+// public static func parseSync(_ parserData: ParserData) throws -> ParsedFeed? { +// +// let type = feedType(parserData) +// +// switch type { +// +// case .jsonFeed: +// return try JSONFeedParser.parse(parserData) +// +// case .rssInJSON: +// return try RSSInJSONParser.parse(parserData) +// +// case .rss: +// return RSSParser.parse(parserData) +// +// case .atom: +// return AtomParser.parse(parserData) +// +// case .unknown, .notAFeed: +// return nil +// } +// } +// +//} diff --git a/Modules/Parser/Sources/FeedParser/Feeds/FeedType.swift b/Modules/Parser/Sources/FeedParser/Feeds/FeedType.swift index 8cf33225a..f934c8920 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/FeedType.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/FeedType.swift @@ -19,44 +19,44 @@ public enum FeedType: Sendable { } -private let minNumberOfBytesRequired = 128 - -public func feedType(_ parserData: ParserData, isPartialData: Bool = false) -> FeedType { - - // Can call with partial data — while still downloading, for instance. - // If there’s not enough data, return .unknown. Ask again when there’s more data. - // If it’s definitely not a feed, return .notAFeed. - // - // This is fast enough to call on the main thread. - - if parserData.data.count < minNumberOfBytesRequired { - return .unknown - } - - let nsdata = parserData.data as NSData - - if nsdata.isProbablyJSONFeed() { - return .jsonFeed - } - if nsdata.isProbablyRSSInJSON() { - return .rssInJSON - } - if nsdata.isProbablyRSS() { - return .rss - } - if nsdata.isProbablyAtom() { - return .atom - } - - if isPartialData && nsdata.isProbablyJSON() { - // Might not be able to detect a JSON Feed without all data. - // Dr. Drang’s JSON Feed (see althis.json and allthis-partial.json in tests) - // has, at this writing, the JSON version element at the end of the feed, - // which is totally legal — but it means not being able to detect - // that it’s a JSON Feed without all the data. 
- // So this returns .unknown instead of .notAFeed. - return .unknown - } - - return .notAFeed -} +//private let minNumberOfBytesRequired = 128 +// +//public func feedType(_ parserData: ParserData, isPartialData: Bool = false) -> FeedType { +// +// // Can call with partial data — while still downloading, for instance. +// // If there’s not enough data, return .unknown. Ask again when there’s more data. +// // If it’s definitely not a feed, return .notAFeed. +// // +// // This is fast enough to call on the main thread. +// +// if parserData.data.count < minNumberOfBytesRequired { +// return .unknown +// } +// +// let nsdata = parserData.data as NSData +// +// if nsdata.isProbablyJSONFeed() { +// return .jsonFeed +// } +// if nsdata.isProbablyRSSInJSON() { +// return .rssInJSON +// } +// if nsdata.isProbablyRSS() { +// return .rss +// } +// if nsdata.isProbablyAtom() { +// return .atom +// } +// +// if isPartialData && nsdata.isProbablyJSON() { +// // Might not be able to detect a JSON Feed without all data. +// // Dr. Drang’s JSON Feed (see althis.json and allthis-partial.json in tests) +// // has, at this writing, the JSON version element at the end of the feed, +// // which is totally legal — but it means not being able to detect +// // that it’s a JSON Feed without all the data. +// // So this returns .unknown instead of .notAFeed. +// return .unknown +// } +// +// return .notAFeed +//} diff --git a/Modules/Parser/Sources/FeedParser/Feeds/JSON/JSONFeedParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/JSON/JSONFeedParser.swift index 247c18612..723ec1afb 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/JSON/JSONFeedParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/JSON/JSONFeedParser.swift @@ -1,248 +1,248 @@ +//// +//// JSONFeedParser.swift +//// RSParser +//// +//// Created by Brent Simmons on 6/25/17. +//// Copyright © 2017 Ranchero Software, LLC. All rights reserved. 
+//// // -// JSONFeedParser.swift -// RSParser +//import Foundation +//import SAX // -// Created by Brent Simmons on 6/25/17. -// Copyright © 2017 Ranchero Software, LLC. All rights reserved. +//// See https://jsonfeed.org/version/1.1 // - -import Foundation -import SAX - -// See https://jsonfeed.org/version/1.1 - -public struct JSONFeedParser { - - struct Key { - static let version = "version" - static let items = "items" - static let title = "title" - static let homePageURL = "home_page_url" - static let feedURL = "feed_url" - static let feedDescription = "description" - static let nextURL = "next_url" - static let icon = "icon" - static let favicon = "favicon" - static let expired = "expired" - static let author = "author" - static let authors = "authors" - static let name = "name" - static let url = "url" - static let avatar = "avatar" - static let hubs = "hubs" - static let type = "type" - static let contentHTML = "content_html" - static let contentText = "content_text" - static let externalURL = "external_url" - static let summary = "summary" - static let image = "image" - static let bannerImage = "banner_image" - static let datePublished = "date_published" - static let dateModified = "date_modified" - static let tags = "tags" - static let uniqueID = "id" - static let attachments = "attachments" - static let mimeType = "mime_type" - static let sizeInBytes = "size_in_bytes" - static let durationInSeconds = "duration_in_seconds" - static let language = "language" - } - - static let jsonFeedVersionMarker = "://jsonfeed.org/version/" // Allow for the mistake of not getting the scheme exactly correct. - - public static func parse(_ parserData: ParserData) throws -> ParsedFeed? { - - guard let d = JSONUtilities.dictionary(with: parserData.data) else { - throw FeedParserError(.invalidJSON) - } - - guard let version = d[Key.version] as? 
String, let _ = version.range(of: JSONFeedParser.jsonFeedVersionMarker) else { - throw FeedParserError(.jsonFeedVersionNotFound) - } - guard let itemsArray = d[Key.items] as? JSONArray else { - throw FeedParserError(.jsonFeedItemsNotFound) - } - guard let title = d[Key.title] as? String else { - throw FeedParserError(.jsonFeedTitleNotFound) - } - - let authors = parseAuthors(d) - let homePageURL = d[Key.homePageURL] as? String - let feedURL = d[Key.feedURL] as? String ?? parserData.url - let feedDescription = d[Key.feedDescription] as? String - let nextURL = d[Key.nextURL] as? String - let iconURL = d[Key.icon] as? String - let faviconURL = d[Key.favicon] as? String - let expired = d[Key.expired] as? Bool ?? false - let hubs = parseHubs(d) - let language = d[Key.language] as? String - - let items = parseItems(itemsArray, parserData.url) - - return ParsedFeed(type: .jsonFeed, title: title, homePageURL: homePageURL, feedURL: feedURL, language: language, feedDescription: feedDescription, nextURL: nextURL, iconURL: iconURL, faviconURL: faviconURL, authors: authors, expired: expired, hubs: hubs, items: items) - } -} - -private extension JSONFeedParser { - - static func parseAuthors(_ dictionary: JSONDictionary) -> Set? { - - if let authorsArray = dictionary[Key.authors] as? JSONArray { - var authors = Set() - for author in authorsArray { - if let parsedAuthor = parseAuthor(author) { - authors.insert(parsedAuthor) - } - } - return authors - } - - guard let authorDictionary = dictionary[Key.author] as? JSONDictionary, - let parsedAuthor = parseAuthor(authorDictionary) else { - return nil - } - - return Set([parsedAuthor]) - } - - static func parseAuthor(_ dictionary: JSONDictionary) -> ParsedAuthor? { - let name = dictionary[Key.name] as? String - let url = dictionary[Key.url] as? String - let avatar = dictionary[Key.avatar] as? 
String - if name == nil && url == nil && avatar == nil { - return nil - } - return ParsedAuthor(name: name, url: url, avatarURL: avatar, emailAddress: nil) - } - - static func parseHubs(_ dictionary: JSONDictionary) -> Set? { - - guard let hubsArray = dictionary[Key.hubs] as? JSONArray else { - return nil - } - - let hubs = hubsArray.compactMap { (hubDictionary) -> ParsedHub? in - guard let hubURL = hubDictionary[Key.url] as? String, let hubType = hubDictionary[Key.type] as? String else { - return nil - } - return ParsedHub(type: hubType, url: hubURL) - } - return hubs.isEmpty ? nil : Set(hubs) - } - - static func parseItems(_ itemsArray: JSONArray, _ feedURL: String) -> Set { - - return Set(itemsArray.compactMap { (oneItemDictionary) -> ParsedItem? in - return parseItem(oneItemDictionary, feedURL) - }) - } - - static func parseItem(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? { - - guard let uniqueID = parseUniqueID(itemDictionary) else { - return nil - } - - let contentHTML = itemDictionary[Key.contentHTML] as? String - let contentText = itemDictionary[Key.contentText] as? String - if contentHTML == nil && contentText == nil { - return nil - } - - let url = itemDictionary[Key.url] as? String - let externalURL = itemDictionary[Key.externalURL] as? String - let title = parseTitle(itemDictionary, feedURL) - let language = itemDictionary[Key.language] as? String - let summary = itemDictionary[Key.summary] as? String - let imageURL = itemDictionary[Key.image] as? String - let bannerImageURL = itemDictionary[Key.bannerImage] as? String - - let datePublished = parseDate(itemDictionary[Key.datePublished] as? String) - let dateModified = parseDate(itemDictionary[Key.dateModified] as? String) - - let authors = parseAuthors(itemDictionary) - var tags: Set? = nil - if let tagsArray = itemDictionary[Key.tags] as? 
[String] { - tags = Set(tagsArray) - } - let attachments = parseAttachments(itemDictionary) - - return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: feedURL, url: url, externalURL: externalURL, title: title, language: language, contentHTML: contentHTML, contentText: contentText, summary: summary, imageURL: imageURL, bannerImageURL: bannerImageURL, datePublished: datePublished, dateModified: dateModified, authors: authors, tags: tags, attachments: attachments) - } - - static func parseTitle(_ itemDictionary: JSONDictionary, _ feedURL: String) -> String? { - - guard let title = itemDictionary[Key.title] as? String else { - return nil - } - - if isSpecialCaseTitleWithEntitiesFeed(feedURL) { - return (title as NSString).rsparser_stringByDecodingHTMLEntities() - } - - return title - } - - static func isSpecialCaseTitleWithEntitiesFeed(_ feedURL: String) -> Bool { - - // As of 16 Feb. 2018, Kottke’s and Heer’s feeds includes HTML entities in the title elements. - // If we find more feeds like this, we’ll add them here. If these feeds get fixed, we’ll remove them. - - let lowerFeedURL = feedURL.lowercased() - let matchStrings = ["kottke.org", "pxlnv.com", "macstories.net", "macobserver.com"] - for matchString in matchStrings { - if lowerFeedURL.contains(matchString) { - return true - } - } - - return false - } - - static func parseUniqueID(_ itemDictionary: JSONDictionary) -> String? { - - if let uniqueID = itemDictionary[Key.uniqueID] as? String { - return uniqueID // Spec says it must be a string - } - // Version 1 spec also says that if it’s a number, even though that’s incorrect, it should be coerced to a string. - if let uniqueID = itemDictionary[Key.uniqueID] as? Int { - return "\(uniqueID)" - } - if let uniqueID = itemDictionary[Key.uniqueID] as? Double { - return "\(uniqueID)" - } - return nil - } - - static func parseDate(_ dateString: String?) -> Date? 
{ - - guard let dateString = dateString, !dateString.isEmpty else { - return nil - } - return RSDateWithString(dateString) - } - - static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set? { - - guard let attachmentsArray = itemDictionary[Key.attachments] as? JSONArray else { - return nil - } - return Set(attachmentsArray.compactMap { parseAttachment($0) }) - } - - static func parseAttachment(_ attachmentObject: JSONDictionary) -> ParsedAttachment? { - - guard let url = attachmentObject[Key.url] as? String else { - return nil - } - guard let mimeType = attachmentObject[Key.mimeType] as? String else { - return nil - } - - let title = attachmentObject[Key.title] as? String - let sizeInBytes = attachmentObject[Key.sizeInBytes] as? Int - let durationInSeconds = attachmentObject[Key.durationInSeconds] as? Int - - return ParsedAttachment(url: url, mimeType: mimeType, title: title, sizeInBytes: sizeInBytes, durationInSeconds: durationInSeconds) - } -} +//public struct JSONFeedParser { +// +// struct Key { +// static let version = "version" +// static let items = "items" +// static let title = "title" +// static let homePageURL = "home_page_url" +// static let feedURL = "feed_url" +// static let feedDescription = "description" +// static let nextURL = "next_url" +// static let icon = "icon" +// static let favicon = "favicon" +// static let expired = "expired" +// static let author = "author" +// static let authors = "authors" +// static let name = "name" +// static let url = "url" +// static let avatar = "avatar" +// static let hubs = "hubs" +// static let type = "type" +// static let contentHTML = "content_html" +// static let contentText = "content_text" +// static let externalURL = "external_url" +// static let summary = "summary" +// static let image = "image" +// static let bannerImage = "banner_image" +// static let datePublished = "date_published" +// static let dateModified = "date_modified" +// static let tags = "tags" +// static let uniqueID = "id" 
+// static let attachments = "attachments" +// static let mimeType = "mime_type" +// static let sizeInBytes = "size_in_bytes" +// static let durationInSeconds = "duration_in_seconds" +// static let language = "language" +// } +// +// static let jsonFeedVersionMarker = "://jsonfeed.org/version/" // Allow for the mistake of not getting the scheme exactly correct. +// +// public static func parse(_ parserData: ParserData) throws -> ParsedFeed? { +// +// guard let d = JSONUtilities.dictionary(with: parserData.data) else { +// throw FeedParserError(.invalidJSON) +// } +// +// guard let version = d[Key.version] as? String, let _ = version.range(of: JSONFeedParser.jsonFeedVersionMarker) else { +// throw FeedParserError(.jsonFeedVersionNotFound) +// } +// guard let itemsArray = d[Key.items] as? JSONArray else { +// throw FeedParserError(.jsonFeedItemsNotFound) +// } +// guard let title = d[Key.title] as? String else { +// throw FeedParserError(.jsonFeedTitleNotFound) +// } +// +// let authors = parseAuthors(d) +// let homePageURL = d[Key.homePageURL] as? String +// let feedURL = d[Key.feedURL] as? String ?? parserData.url +// let feedDescription = d[Key.feedDescription] as? String +// let nextURL = d[Key.nextURL] as? String +// let iconURL = d[Key.icon] as? String +// let faviconURL = d[Key.favicon] as? String +// let expired = d[Key.expired] as? Bool ?? false +// let hubs = parseHubs(d) +// let language = d[Key.language] as? String +// +// let items = parseItems(itemsArray, parserData.url) +// +// return ParsedFeed(type: .jsonFeed, title: title, homePageURL: homePageURL, feedURL: feedURL, language: language, feedDescription: feedDescription, nextURL: nextURL, iconURL: iconURL, faviconURL: faviconURL, authors: authors, expired: expired, hubs: hubs, items: items) +// } +//} +// +//private extension JSONFeedParser { +// +// static func parseAuthors(_ dictionary: JSONDictionary) -> Set? { +// +// if let authorsArray = dictionary[Key.authors] as? 
JSONArray { +// var authors = Set() +// for author in authorsArray { +// if let parsedAuthor = parseAuthor(author) { +// authors.insert(parsedAuthor) +// } +// } +// return authors +// } +// +// guard let authorDictionary = dictionary[Key.author] as? JSONDictionary, +// let parsedAuthor = parseAuthor(authorDictionary) else { +// return nil +// } +// +// return Set([parsedAuthor]) +// } +// +// static func parseAuthor(_ dictionary: JSONDictionary) -> ParsedAuthor? { +// let name = dictionary[Key.name] as? String +// let url = dictionary[Key.url] as? String +// let avatar = dictionary[Key.avatar] as? String +// if name == nil && url == nil && avatar == nil { +// return nil +// } +// return ParsedAuthor(name: name, url: url, avatarURL: avatar, emailAddress: nil) +// } +// +// static func parseHubs(_ dictionary: JSONDictionary) -> Set? { +// +// guard let hubsArray = dictionary[Key.hubs] as? JSONArray else { +// return nil +// } +// +// let hubs = hubsArray.compactMap { (hubDictionary) -> ParsedHub? in +// guard let hubURL = hubDictionary[Key.url] as? String, let hubType = hubDictionary[Key.type] as? String else { +// return nil +// } +// return ParsedHub(type: hubType, url: hubURL) +// } +// return hubs.isEmpty ? nil : Set(hubs) +// } +// +// static func parseItems(_ itemsArray: JSONArray, _ feedURL: String) -> Set { +// +// return Set(itemsArray.compactMap { (oneItemDictionary) -> ParsedItem? in +// return parseItem(oneItemDictionary, feedURL) +// }) +// } +// +// static func parseItem(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? { +// +// guard let uniqueID = parseUniqueID(itemDictionary) else { +// return nil +// } +// +// let contentHTML = itemDictionary[Key.contentHTML] as? String +// let contentText = itemDictionary[Key.contentText] as? String +// if contentHTML == nil && contentText == nil { +// return nil +// } +// +// let url = itemDictionary[Key.url] as? String +// let externalURL = itemDictionary[Key.externalURL] as? 
String +// let title = parseTitle(itemDictionary, feedURL) +// let language = itemDictionary[Key.language] as? String +// let summary = itemDictionary[Key.summary] as? String +// let imageURL = itemDictionary[Key.image] as? String +// let bannerImageURL = itemDictionary[Key.bannerImage] as? String +// +// let datePublished = parseDate(itemDictionary[Key.datePublished] as? String) +// let dateModified = parseDate(itemDictionary[Key.dateModified] as? String) +// +// let authors = parseAuthors(itemDictionary) +// var tags: Set? = nil +// if let tagsArray = itemDictionary[Key.tags] as? [String] { +// tags = Set(tagsArray) +// } +// let attachments = parseAttachments(itemDictionary) +// +// return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: feedURL, url: url, externalURL: externalURL, title: title, language: language, contentHTML: contentHTML, contentText: contentText, summary: summary, imageURL: imageURL, bannerImageURL: bannerImageURL, datePublished: datePublished, dateModified: dateModified, authors: authors, tags: tags, attachments: attachments) +// } +// +// static func parseTitle(_ itemDictionary: JSONDictionary, _ feedURL: String) -> String? { +// +// guard let title = itemDictionary[Key.title] as? String else { +// return nil +// } +// +// if isSpecialCaseTitleWithEntitiesFeed(feedURL) { +// return (title as NSString).rsparser_stringByDecodingHTMLEntities() +// } +// +// return title +// } +// +// static func isSpecialCaseTitleWithEntitiesFeed(_ feedURL: String) -> Bool { +// +// // As of 16 Feb. 2018, Kottke’s and Heer’s feeds includes HTML entities in the title elements. +// // If we find more feeds like this, we’ll add them here. If these feeds get fixed, we’ll remove them. 
+// +// let lowerFeedURL = feedURL.lowercased() +// let matchStrings = ["kottke.org", "pxlnv.com", "macstories.net", "macobserver.com"] +// for matchString in matchStrings { +// if lowerFeedURL.contains(matchString) { +// return true +// } +// } +// +// return false +// } +// +// static func parseUniqueID(_ itemDictionary: JSONDictionary) -> String? { +// +// if let uniqueID = itemDictionary[Key.uniqueID] as? String { +// return uniqueID // Spec says it must be a string +// } +// // Version 1 spec also says that if it’s a number, even though that’s incorrect, it should be coerced to a string. +// if let uniqueID = itemDictionary[Key.uniqueID] as? Int { +// return "\(uniqueID)" +// } +// if let uniqueID = itemDictionary[Key.uniqueID] as? Double { +// return "\(uniqueID)" +// } +// return nil +// } +// +// static func parseDate(_ dateString: String?) -> Date? { +// +// guard let dateString = dateString, !dateString.isEmpty else { +// return nil +// } +// return RSDateWithString(dateString) +// } +// +// static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set? { +// +// guard let attachmentsArray = itemDictionary[Key.attachments] as? JSONArray else { +// return nil +// } +// return Set(attachmentsArray.compactMap { parseAttachment($0) }) +// } +// +// static func parseAttachment(_ attachmentObject: JSONDictionary) -> ParsedAttachment? { +// +// guard let url = attachmentObject[Key.url] as? String else { +// return nil +// } +// guard let mimeType = attachmentObject[Key.mimeType] as? String else { +// return nil +// } +// +// let title = attachmentObject[Key.title] as? String +// let sizeInBytes = attachmentObject[Key.sizeInBytes] as? Int +// let durationInSeconds = attachmentObject[Key.durationInSeconds] as? 
Int +// +// return ParsedAttachment(url: url, mimeType: mimeType, title: title, sizeInBytes: sizeInBytes, durationInSeconds: durationInSeconds) +// } +//} diff --git a/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift index e27c0e629..4bf2ad624 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/JSON/RSSInJSONParser.swift @@ -1,182 +1,182 @@ +//// +//// RSSInJSONParser.swift +//// RSParser +//// +//// Created by Brent Simmons on 6/24/17. +//// Copyright © 2017 Ranchero Software, LLC. All rights reserved. +//// // -// RSSInJSONParser.swift -// RSParser +//import Foundation +//import SAX // -// Created by Brent Simmons on 6/24/17. -// Copyright © 2017 Ranchero Software, LLC. All rights reserved. +//// See https://github.com/scripting/Scripting-News/blob/master/rss-in-json/README.md +//// Also: http://cyber.harvard.edu/rss/rss.html // - -import Foundation -import SAX - -// See https://github.com/scripting/Scripting-News/blob/master/rss-in-json/README.md -// Also: http://cyber.harvard.edu/rss/rss.html - -public struct RSSInJSONParser { - - public static func parse(_ parserData: ParserData) throws -> ParsedFeed? { - - do { - guard let parsedObject = try JSONSerialization.jsonObject(with: parserData.data) as? JSONDictionary else { - throw FeedParserError(.invalidJSON) - } - guard let rssObject = parsedObject["rss"] as? JSONDictionary else { - throw FeedParserError(.rssChannelNotFound) - } - guard let channelObject = rssObject["channel"] as? JSONDictionary else { - throw FeedParserError(.rssChannelNotFound) - } - - // I’d bet money that in practice the items array won’t always appear correctly inside the channel object. - // I’d also bet that sometimes it gets called "items" instead of "item". - var itemsObject = channelObject["item"] as? JSONArray - if itemsObject == nil { - itemsObject = parsedObject["item"] as? 
JSONArray - } - if itemsObject == nil { - itemsObject = channelObject["items"] as? JSONArray - } - if itemsObject == nil { - itemsObject = parsedObject["items"] as? JSONArray - } - if itemsObject == nil { - throw FeedParserError(.rssItemsNotFound) - } - - let title = channelObject["title"] as? String - let homePageURL = channelObject["link"] as? String - let feedURL = parserData.url - let feedDescription = channelObject["description"] as? String - let feedLanguage = channelObject["language"] as? String - - let items = parseItems(itemsObject!, parserData.url) - - return ParsedFeed(type: .rssInJSON, title: title, homePageURL: homePageURL, feedURL: feedURL, language: feedLanguage, feedDescription: feedDescription, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items) - - } - catch { throw error } - } -} - -private extension RSSInJSONParser { - - static func parseItems(_ itemsObject: JSONArray, _ feedURL: String) -> Set { - - return Set(itemsObject.compactMap{ (oneItemDictionary) -> ParsedItem? in - - return parsedItemWithDictionary(oneItemDictionary, feedURL) - }) - } - - static func parsedItemWithDictionary(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? { - - let externalURL = itemDictionary["link"] as? String - let title = itemDictionary["title"] as? String - - var contentHTML = itemDictionary["description"] as? String - var contentText: String? = nil - if contentHTML != nil && !(contentHTML!.contains("<")) { - contentText = contentHTML - contentHTML = nil - } - if contentHTML == nil && contentText == nil && title == nil { - return nil - } - - var datePublished: Date? = nil - if let datePublishedString = itemDictionary["pubDate"] as? String { - datePublished = RSDateWithString(datePublishedString) - } - - let authors = parseAuthors(itemDictionary) - let tags = parseTags(itemDictionary) - let attachments = parseAttachments(itemDictionary) - - var uniqueID: String? = itemDictionary["guid"] as? 
String - if uniqueID == nil { - - // Calculate a uniqueID based on a combination of non-empty elements. Then hash the result. - // Items should have guids. When they don't, re-runs are very likely - // because there's no other 100% reliable way to determine identity. - // This calculated uniqueID is valid only for this particular feed. (Just like ids in JSON Feed.) - - var s = "" - if let datePublished = datePublished { - s += "\(datePublished.timeIntervalSince1970)" - } - if let title = title { - s += title - } - if let externalURL = externalURL { - s += externalURL - } - if let authorEmailAddress = authors?.first?.emailAddress { - s += authorEmailAddress - } - if let oneAttachmentURL = attachments?.first?.url { - s += oneAttachmentURL - } - if s.isEmpty { - // Sheesh. Tough case. - if let _ = contentHTML { - s = contentHTML! - } - if let _ = contentText { - s = contentText! - } - } - uniqueID = (s as NSString).rsparser_md5Hash() - } - - if let uniqueID = uniqueID { - return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: feedURL, url: nil, externalURL: externalURL, title: title, language: nil, contentHTML: contentHTML, contentText: contentText, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: nil, authors: authors, tags: tags, attachments: attachments) - } - return nil - } - - static func parseAuthors(_ itemDictionary: JSONDictionary) -> Set? { - - guard let authorEmailAddress = itemDictionary["author"] as? String else { - return nil - } - let parsedAuthor = ParsedAuthor(name: nil, url: nil, avatarURL: nil, emailAddress: authorEmailAddress) - return Set([parsedAuthor]) - } - - static func parseTags(_ itemDictionary: JSONDictionary) -> Set? { - - if let categoryObject = itemDictionary["category"] as? JSONDictionary { - if let oneTag = categoryObject["#value"] as? String { - return Set([oneTag]) - } - return nil - } - else if let categoryArray = itemDictionary["category"] as? 
JSONArray { - return Set(categoryArray.compactMap{ $0["#value"] as? String }) - } - return nil - } - - static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set? { - - guard let enclosureObject = itemDictionary["enclosure"] as? JSONDictionary else { - return nil - } - guard let attachmentURL = enclosureObject["url"] as? String else { - return nil - } - - var attachmentSize = enclosureObject["length"] as? Int - if attachmentSize == nil { - if let attachmentSizeString = enclosureObject["length"] as? String { - attachmentSize = (attachmentSizeString as NSString).integerValue - } - } - - let type = enclosureObject["type"] as? String - if let attachment = ParsedAttachment(url: attachmentURL, mimeType: type, title: nil, sizeInBytes: attachmentSize, durationInSeconds: nil) { - return Set([attachment]) - } - return nil - } -} +//public struct RSSInJSONParser { +// +// public static func parse(_ parserData: ParserData) throws -> ParsedFeed? { +// +// do { +// guard let parsedObject = try JSONSerialization.jsonObject(with: parserData.data) as? JSONDictionary else { +// throw FeedParserError(.invalidJSON) +// } +// guard let rssObject = parsedObject["rss"] as? JSONDictionary else { +// throw FeedParserError(.rssChannelNotFound) +// } +// guard let channelObject = rssObject["channel"] as? JSONDictionary else { +// throw FeedParserError(.rssChannelNotFound) +// } +// +// // I’d bet money that in practice the items array won’t always appear correctly inside the channel object. +// // I’d also bet that sometimes it gets called "items" instead of "item". +// var itemsObject = channelObject["item"] as? JSONArray +// if itemsObject == nil { +// itemsObject = parsedObject["item"] as? JSONArray +// } +// if itemsObject == nil { +// itemsObject = channelObject["items"] as? JSONArray +// } +// if itemsObject == nil { +// itemsObject = parsedObject["items"] as? 
JSONArray +// } +// if itemsObject == nil { +// throw FeedParserError(.rssItemsNotFound) +// } +// +// let title = channelObject["title"] as? String +// let homePageURL = channelObject["link"] as? String +// let feedURL = parserData.url +// let feedDescription = channelObject["description"] as? String +// let feedLanguage = channelObject["language"] as? String +// +// let items = parseItems(itemsObject!, parserData.url) +// +// return ParsedFeed(type: .rssInJSON, title: title, homePageURL: homePageURL, feedURL: feedURL, language: feedLanguage, feedDescription: feedDescription, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items) +// +// } +// catch { throw error } +// } +//} +// +//private extension RSSInJSONParser { +// +// static func parseItems(_ itemsObject: JSONArray, _ feedURL: String) -> Set { +// +// return Set(itemsObject.compactMap{ (oneItemDictionary) -> ParsedItem? in +// +// return parsedItemWithDictionary(oneItemDictionary, feedURL) +// }) +// } +// +// static func parsedItemWithDictionary(_ itemDictionary: JSONDictionary, _ feedURL: String) -> ParsedItem? { +// +// let externalURL = itemDictionary["link"] as? String +// let title = itemDictionary["title"] as? String +// +// var contentHTML = itemDictionary["description"] as? String +// var contentText: String? = nil +// if contentHTML != nil && !(contentHTML!.contains("<")) { +// contentText = contentHTML +// contentHTML = nil +// } +// if contentHTML == nil && contentText == nil && title == nil { +// return nil +// } +// +// var datePublished: Date? = nil +// if let datePublishedString = itemDictionary["pubDate"] as? String { +// datePublished = RSDateWithString(datePublishedString) +// } +// +// let authors = parseAuthors(itemDictionary) +// let tags = parseTags(itemDictionary) +// let attachments = parseAttachments(itemDictionary) +// +// var uniqueID: String? = itemDictionary["guid"] as? 
String +// if uniqueID == nil { +// +// // Calculate a uniqueID based on a combination of non-empty elements. Then hash the result. +// // Items should have guids. When they don't, re-runs are very likely +// // because there's no other 100% reliable way to determine identity. +// // This calculated uniqueID is valid only for this particular feed. (Just like ids in JSON Feed.) +// +// var s = "" +// if let datePublished = datePublished { +// s += "\(datePublished.timeIntervalSince1970)" +// } +// if let title = title { +// s += title +// } +// if let externalURL = externalURL { +// s += externalURL +// } +// if let authorEmailAddress = authors?.first?.emailAddress { +// s += authorEmailAddress +// } +// if let oneAttachmentURL = attachments?.first?.url { +// s += oneAttachmentURL +// } +// if s.isEmpty { +// // Sheesh. Tough case. +// if let _ = contentHTML { +// s = contentHTML! +// } +// if let _ = contentText { +// s = contentText! +// } +// } +// uniqueID = (s as NSString).rsparser_md5Hash() +// } +// +// if let uniqueID = uniqueID { +// return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: feedURL, url: nil, externalURL: externalURL, title: title, language: nil, contentHTML: contentHTML, contentText: contentText, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: nil, authors: authors, tags: tags, attachments: attachments) +// } +// return nil +// } +// +// static func parseAuthors(_ itemDictionary: JSONDictionary) -> Set? { +// +// guard let authorEmailAddress = itemDictionary["author"] as? String else { +// return nil +// } +// let parsedAuthor = ParsedAuthor(name: nil, url: nil, avatarURL: nil, emailAddress: authorEmailAddress) +// return Set([parsedAuthor]) +// } +// +// static func parseTags(_ itemDictionary: JSONDictionary) -> Set? { +// +// if let categoryObject = itemDictionary["category"] as? JSONDictionary { +// if let oneTag = categoryObject["#value"] as? 
String { +// return Set([oneTag]) +// } +// return nil +// } +// else if let categoryArray = itemDictionary["category"] as? JSONArray { +// return Set(categoryArray.compactMap{ $0["#value"] as? String }) +// } +// return nil +// } +// +// static func parseAttachments(_ itemDictionary: JSONDictionary) -> Set? { +// +// guard let enclosureObject = itemDictionary["enclosure"] as? JSONDictionary else { +// return nil +// } +// guard let attachmentURL = enclosureObject["url"] as? String else { +// return nil +// } +// +// var attachmentSize = enclosureObject["length"] as? Int +// if attachmentSize == nil { +// if let attachmentSizeString = enclosureObject["length"] as? String { +// attachmentSize = (attachmentSizeString as NSString).integerValue +// } +// } +// +// let type = enclosureObject["type"] as? String +// if let attachment = ParsedAttachment(url: attachmentURL, mimeType: type, title: nil, sizeInBytes: attachmentSize, durationInSeconds: nil) { +// return Set([attachment]) +// } +// return nil +// } +//} diff --git a/Modules/Parser/Sources/FeedParser/Feeds/ParsedItem.swift b/Modules/Parser/Sources/FeedParser/Feeds/ParsedItem.swift index c9fc2eeb8..d158c74dd 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/ParsedItem.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/ParsedItem.swift @@ -63,5 +63,10 @@ public final class ParsedItem: Hashable, Sendable { hasher.combine(feedURL) } } + + public static func ==(lhs: ParsedItem, rhs: ParsedItem) -> Bool { + + lhs.syncServiceID == rhs.syncServiceID && lhs.uniqueID == rhs.uniqueID && lhs.feedURL == rhs.feedURL && lhs.url == rhs.url && lhs.externalURL == rhs.externalURL && lhs.title == rhs.title && lhs.language == rhs.language && lhs.contentHTML == rhs.contentHTML && lhs.contentText == rhs.contentText && lhs.summary == rhs.summary && lhs.imageURL == rhs.imageURL && lhs.bannerImageURL == rhs.bannerImageURL && lhs.datePublished == rhs.datePublished && lhs.dateModified == rhs.dateModified && lhs.authors == rhs.authors && 
lhs.tags == rhs.tags && lhs.attachments == rhs.attachments + } } diff --git a/Modules/Parser/Sources/FeedParser/Feeds/XML/AtomParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/XML/AtomParser.swift index 43fee7810..e3b4610e7 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/XML/AtomParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/XML/AtomParser.swift @@ -17,13 +17,13 @@ import SAX // // In general, you should see FeedParser.swift for all your feed-parsing needs. -public struct AtomParser { - - public static func parse(_ parserData: ParserData) -> ParsedFeed? { - - if let rsParsedFeed = RSAtomParser.parseFeed(with: parserData) { - return RSParsedFeedTransformer.parsedFeed(rsParsedFeed) - } - return nil - } -} +//public struct AtomParser { +// +// public static func parse(_ parserData: ParserData) -> ParsedFeed? { +// +// if let rsParsedFeed = RSAtomParser.parseFeed(with: parserData) { +// return RSParsedFeedTransformer.parsedFeed(rsParsedFeed) +// } +// return nil +// } +//} diff --git a/Modules/Parser/Sources/FeedParser/Feeds/XML/RSParsedFeedTransformer.swift b/Modules/Parser/Sources/FeedParser/Feeds/XML/RSParsedFeedTransformer.swift index c6d0b2ba6..9f3bc74ce 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/XML/RSParsedFeedTransformer.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/XML/RSParsedFeedTransformer.swift @@ -13,65 +13,65 @@ import Foundation // These functions take an RSParsedFeed and return a Swift-y ParsedFeed, // which is part of providing a single API for feed parsing. 
-struct RSParsedFeedTransformer { - - static func parsedFeed(_ rsParsedFeed: RSParsedFeed) -> ParsedFeed { - - let items = parsedItems(rsParsedFeed.articles) - return ParsedFeed(type: .rss, title: rsParsedFeed.title, homePageURL: rsParsedFeed.link, feedURL: rsParsedFeed.urlString, language: rsParsedFeed.language, feedDescription: nil, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items) - } -} - -private extension RSParsedFeedTransformer { - - static func parsedItems(_ parsedArticles: Set) -> Set { - - // Create Set from Set - - return Set(parsedArticles.map(parsedItem)) - } - - static func parsedItem(_ parsedArticle: RSParsedArticle) -> ParsedItem { - - let uniqueID = parsedArticle.articleID - let url = parsedArticle.permalink - let externalURL = parsedArticle.link - let title = parsedArticle.title - let language = parsedArticle.language - let contentHTML = parsedArticle.body - let datePublished = parsedArticle.datePublished - let dateModified = parsedArticle.dateModified - let authors = parsedAuthors(parsedArticle.authors) - let attachments = parsedAttachments(parsedArticle.enclosures) - - return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: parsedArticle.feedURL, url: url, externalURL: externalURL, title: title, language: language, contentHTML: contentHTML, contentText: nil, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: dateModified, authors: authors, tags: nil, attachments: attachments) - } - - static func parsedAuthors(_ authors: Set?) -> Set? { - - guard let authors = authors, !authors.isEmpty else { - return nil - } - - let transformedAuthors = authors.compactMap { (author) -> ParsedAuthor? in - return ParsedAuthor(name: author.name, url: author.url, avatarURL: nil, emailAddress: author.emailAddress) - } - - return transformedAuthors.isEmpty ? nil : Set(transformedAuthors) - } - - static func parsedAttachments(_ enclosures: Set?) -> Set? 
{ - - guard let enclosures = enclosures, !enclosures.isEmpty else { - return nil - } - - let attachments = enclosures.compactMap { (enclosure) -> ParsedAttachment? in - - let sizeInBytes = enclosure.length > 0 ? enclosure.length : nil - return ParsedAttachment(url: enclosure.url, mimeType: enclosure.mimeType, title: nil, sizeInBytes: sizeInBytes, durationInSeconds: nil) - } - - return attachments.isEmpty ? nil : Set(attachments) - } -} +//struct RSParsedFeedTransformer { +// +// static func parsedFeed(_ rsParsedFeed: RSParsedFeed) -> ParsedFeed { +// +// let items = parsedItems(rsParsedFeed.articles) +// return ParsedFeed(type: .rss, title: rsParsedFeed.title, homePageURL: rsParsedFeed.link, feedURL: rsParsedFeed.urlString, language: rsParsedFeed.language, feedDescription: nil, nextURL: nil, iconURL: nil, faviconURL: nil, authors: nil, expired: false, hubs: nil, items: items) +// } +//} +// +//private extension RSParsedFeedTransformer { +// +// static func parsedItems(_ parsedArticles: Set) -> Set { +// +// // Create Set from Set +// +// return Set(parsedArticles.map(parsedItem)) +// } +// +// static func parsedItem(_ parsedArticle: RSParsedArticle) -> ParsedItem { +// +// let uniqueID = parsedArticle.articleID +// let url = parsedArticle.permalink +// let externalURL = parsedArticle.link +// let title = parsedArticle.title +// let language = parsedArticle.language +// let contentHTML = parsedArticle.body +// let datePublished = parsedArticle.datePublished +// let dateModified = parsedArticle.dateModified +// let authors = parsedAuthors(parsedArticle.authors) +// let attachments = parsedAttachments(parsedArticle.enclosures) +// +// return ParsedItem(syncServiceID: nil, uniqueID: uniqueID, feedURL: parsedArticle.feedURL, url: url, externalURL: externalURL, title: title, language: language, contentHTML: contentHTML, contentText: nil, summary: nil, imageURL: nil, bannerImageURL: nil, datePublished: datePublished, dateModified: dateModified, authors: authors, tags: 
nil, attachments: attachments) +// } +// +// static func parsedAuthors(_ authors: Set?) -> Set? { +// +// guard let authors = authors, !authors.isEmpty else { +// return nil +// } +// +// let transformedAuthors = authors.compactMap { (author) -> ParsedAuthor? in +// return ParsedAuthor(name: author.name, url: author.url, avatarURL: nil, emailAddress: author.emailAddress) +// } +// +// return transformedAuthors.isEmpty ? nil : Set(transformedAuthors) +// } +// +// static func parsedAttachments(_ enclosures: Set?) -> Set? { +// +// guard let enclosures = enclosures, !enclosures.isEmpty else { +// return nil +// } +// +// let attachments = enclosures.compactMap { (enclosure) -> ParsedAttachment? in +// +// let sizeInBytes = enclosure.length > 0 ? enclosure.length : nil +// return ParsedAttachment(url: enclosure.url, mimeType: enclosure.mimeType, title: nil, sizeInBytes: sizeInBytes, durationInSeconds: nil) +// } +// +// return attachments.isEmpty ? nil : Set(attachments) +// } +//} diff --git a/Modules/Parser/Sources/FeedParser/Feeds/XML/RSSParser.swift b/Modules/Parser/Sources/FeedParser/Feeds/XML/RSSParser.swift index 6410d8ae8..64b1680f0 100644 --- a/Modules/Parser/Sources/FeedParser/Feeds/XML/RSSParser.swift +++ b/Modules/Parser/Sources/FeedParser/Feeds/XML/RSSParser.swift @@ -33,7 +33,7 @@ public final class RSSParser { private var parsingAuthor = false private var currentAttributes: SAXParser.XMLAttributesDictionary? 
- public static func parsedFeed(with parserData: ParserData) -> RSSFeed { + static func parsedFeed(with parserData: ParserData) -> RSSFeed { let parser = RSSParser(parserData) parser.parse() @@ -48,6 +48,12 @@ public final class RSSParser { private extension RSSParser { + func parse() { + + let saxParser = SAXParser(delegate: self, data: data) + saxParser.parse() + } + private struct XMLName { static let uppercaseRDF = "RDF".utf8CString static let item = "item".utf8CString @@ -63,9 +69,13 @@ private extension RSSParser { static let dc = "dc".utf8CString static let content = "content".utf8CString static let encoded = "encoded".utf8CString + static let creator = "creator".utf8CString + static let date = "date".utf8CString + static let pubDate = "pubDate".utf8CString + static let description = "description".utf8CString } - func addFeedElement(_ localName: XMLPointer, _ prefix: XMLPointer?) { + func addFeedElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) { guard prefix == nil else { return @@ -73,14 +83,14 @@ private extension RSSParser { if SAXEqualTags(localName, XMLName.link) { if feed.link == nil { - feed.link = currentString + feed.link = saxParser.currentString } } else if SAXEqualTags(localName, XMLName.title) { - feed.title = currentString + feed.title = saxParser.currentString } else if SAXEqualTags(localName, XMLName.language) { - feed.language = currentString + feed.language = saxParser.currentString } } @@ -91,13 +101,17 @@ private extension RSSParser { func addArticleElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ prefix: XMLPointer?) 
{ - if SAXEqualTags(prefix, XMLName.dc) { - addDCElement(localName) - return; + guard let currentArticle else { + return } - if SAXEqualTags(prefix, XMLName.content) && SAXEqualTags(localName, XMLName.encoded) { - if let currentString, !currentString.isEmpty { + if let prefix, SAXEqualTags(prefix, XMLName.dc) { + addDCElement(saxParser, localName, currentArticle) + return + } + + if let prefix, SAXEqualTags(prefix, XMLName.content) && SAXEqualTags(localName, XMLName.encoded) { + if let currentString = saxParser.currentString, !currentString.isEmpty { currentArticle.body = currentString } return @@ -107,40 +121,171 @@ private extension RSSParser { return } - if SAXEqualTags(localName, XMLName.guid) { - addGuid() + if let currentString = saxParser.currentString { + if SAXEqualTags(localName, XMLName.guid) { + addGuid(currentString, currentArticle) + } + else if SAXEqualTags(localName, XMLName.author) { + addAuthorWithString(currentString, currentArticle) + } + else if SAXEqualTags(localName, XMLName.link) { + currentArticle.link = urlString(currentString) + } + else if SAXEqualTags(localName, XMLName.description) { + if currentArticle.body == nil { + currentArticle.body = currentString + } + } + else if !parsingAuthor && SAXEqualTags(localName, XMLName.title) { + currentArticle.title = currentString + } } else if SAXEqualTags(localName, XMLName.pubDate) { currentArticle.datePublished = currentDate(saxParser) } - else if SAXEqualTags(localName, XMLName.author) { - addAuthorWithString(currentString) + else if SAXEqualTags(localName, XMLName.enclosure), let currentAttributes { + addEnclosure(currentAttributes, currentArticle) } - else if SAXEqualTags(localName, XMLName.link) { - currentArticle.link = urlString(currentString) - } - else if SAXEqualTags(localName, XMLName.description) { - if currentArticle.body == nil { - currentArticle.body = currentString + } + + func addDCElement(_ saxParser: SAXParser, _ localName: XMLPointer, _ currentArticle: RSSArticle) { + + if 
SAXEqualTags(localName, XMLName.creator) { + if let currentString = saxParser.currentString { + addAuthorWithString(currentString, currentArticle) } } - else if !parsingAuthor && SAXEqualTags(localName, XMLName.title) { - if let currentString { - currentArticle.title = currentString + else if SAXEqualTags(localName, XMLName.date) { + currentArticle.datePublished = currentDate(saxParser) + } + } + + static let isPermalinkKey = "isPermaLink" + static let isPermalinkLowercaseKey = "ispermalink" + static let falseValue = "false" + + func addGuid(_ guid: String, _ currentArticle: RSSArticle) { + + currentArticle.guid = guid + + guard let currentAttributes else { + return + } + + let isPermaLinkValue: String? = { + + if let value = currentAttributes[Self.isPermalinkKey] { + return value } + // Allow for `ispermalink`, `isPermalink`, etc. + for (key, value) in currentAttributes { + if key.lowercased() == Self.isPermalinkLowercaseKey { + return value + } + } + + return nil + }() + + // Spec: `isPermaLink is optional, its default value is true.` + // https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt + // Return only if non-nil and equal to false — otherwise it’s a permalink. + if let isPermaLinkValue, isPermaLinkValue == Self.falseValue { + return } - else if SAXEqualTags(localName, XMLName.enclosure) { - addEnclosure() + + // Feed bug found in the wild: using a guid that’s not really a permalink + // and not realizing that `isPermaLink` is true by default. + if stringIsProbablyAURLOrRelativePath(guid) { + currentArticle.permalink = urlString(guid) } } + func stringIsProbablyAURLOrRelativePath(_ s: String) -> Bool { + + // The RSS guid is defined as a permalink, except when it appears like this: + // `<guid isPermaLink="false">some—identifier</guid>` + // However, people often seem to think it’s *not* a permalink by default, even + // though it is. So we try to detect the situation where the value is not a URL string, + // and not even a relative path. This may need to evolve over time. 
+ + if !s.contains("/") { + // This seems to be just about the best possible check. + // Bad guids are often just integers, for instance. + return false + } + + if s.lowercased().hasPrefix("tag:") { + // A common non-URL guid form starts with `tag:`. + return false + } + + return true + } + + /// Do best attempt at turning a string into a URL string. + /// + /// If it already appears to be a URL, return it. + /// Otherwise, treat it like a relative URL and resolve using + /// the URL of the home page of the feed (if available) + /// or the URL of the feed. + /// + /// The returned value is not guaranteed to be a valid URL string. + /// It’s a best attempt without going to heroic lengths. + func urlString(_ s: String) -> String { + + if s.lowercased().hasPrefix("http") { + return s + } + + let baseURLString = feed.link ?? feedURL + guard let baseURL = URL(string: baseURLString) else { + return s + } + guard let resolvedURL = URL(string: s, relativeTo: baseURL) else { + return s + } + + return resolvedURL.absoluteString + } + + func addAuthorWithString(_ authorString: String, _ currentArticle: RSSArticle) { + + if authorString.isEmpty { + return + } + + let author = RSSAuthor(singleString: authorString) + currentArticle.addAuthor(author) + } + + private struct EnclosureKey { + static let url = "url" + static let length = "length" + static let type = "type" + } + + func addEnclosure(_ attributes: SAXParser.XMLAttributesDictionary, _ currentArticle: RSSArticle) { + + guard let url = attributes[EnclosureKey.url], !url.isEmpty else { + return + } + + let enclosure = RSSEnclosure(url: url) + if let lengthValue = attributes[EnclosureKey.length], let length = Int(lengthValue) { + enclosure.length = length + } + enclosure.mimeType = attributes[EnclosureKey.type] + + currentArticle.addEnclosure(enclosure) + } + func currentDate(_ saxParser: SAXParser) -> Date? 
{ guard let data = saxParser.currentCharacters else { return nil } return DateParser.date(data: data) - } } @@ -157,8 +302,8 @@ extension RSSParser: SAXParserDelegate { return } - var xmlAttributes: XMLAttributesDictionary? = nil - if (isRDF && SAXEqualTags(localName, XMLName.item)) || SAXEqualTags(localName, XMLName.guid) || SAXEqualTags(enclosure, XMLName.enclosure) { + var xmlAttributes: SAXParser.XMLAttributesDictionary? = nil + if (isRDF && SAXEqualTags(localName, XMLName.item)) || SAXEqualTags(localName, XMLName.guid) || SAXEqualTags(localName, XMLName.enclosure) { xmlAttributes = saxParser.attributesDictionary(attributes, attributeCount: attributeCount) } if currentAttributes != xmlAttributes { @@ -169,7 +314,7 @@ extension RSSParser: SAXParserDelegate { addArticle() parsingArticle = true - if isRDF && let rdfGuid = xmlAttributes?[XMLName.rdfAbout], let currentArticle { // RSS 1.0 guid + if isRDF, let rdfGuid = xmlAttributes?[XMLName.rdfAbout], let currentArticle { // RSS 1.0 guid currentArticle.guid = rdfGuid currentArticle.permalink = rdfGuid } diff --git a/Modules/Parser/Sources/SAX/SAXUtilities.swift b/Modules/Parser/Sources/SAX/SAXUtilities.swift index dccda4e9c..10ba86f0a 100644 --- a/Modules/Parser/Sources/SAX/SAXUtilities.swift +++ b/Modules/Parser/Sources/SAX/SAXUtilities.swift @@ -1,5 +1,5 @@ // -// File.swift +// SAXUtilities.swift // // // Created by Brent Simmons on 8/26/24.