Restore changes reverted in previous beta.

This commit is contained in:
Brent Simmons
2025-05-09 20:30:04 -07:00
parent 0a9c323dfc
commit 4d930dd5e4
12 changed files with 216 additions and 85 deletions

View File

@@ -189,13 +189,28 @@ public extension String {
/// Removes an HTML tag and everything between its start and end tags.
///
/// The regex pattern `<tag>[\\s\\S]*?</tag>` explanation:
/// - `<` matches the literal `<` character.
/// - `tag` matches the literal parameter provided to the function, e.g., `style`.
/// - `>` matches the literal `>` character.
/// - `[\\s\\S]*?`
///   - `[\\s\\S]` matches _any_ character, including new lines.
///   - `*` will match zero or more of the preceding character, in this case _any_
///     character.
///   - `?` switches the matching mode to [lazy](https://javascript.info/regexp-greedy-and-lazy)
///     so it will match as few characters as possible before satisfying the rest of the pattern.
/// - `<` matches the literal `<` character.
/// - `/` matches the literal `/` character.
/// - `tag` matches the literal parameter provided to the function, e.g., `style`.
/// - `>` matches the literal `>` character.
///
/// Matching is case-insensitive, and every occurrence in the string is removed.
///
/// - Parameter tag: The tag to remove. It is interpolated into the pattern as-is, so any
///   regex metacharacters it contains are not escaped.
///
/// - Returns: A new copy of `self` with the tag removed.
///
/// - Note: Doesn't work correctly with nested tags of the same name.
/// - Note: The `<tag>` form of the pattern expects `>` immediately after the tag name, so a
///   start tag that carries attributes (e.g. `<style type="text/css">`) is not matched by it.
private func removingTagAndContents(_ tag: String) -> String {
return self.replacingOccurrences(of: "<\(tag).+?</\(tag)>", with: "", options: [.regularExpression, .caseInsensitive])
return self.replacingOccurrences(of: "<\(tag)>[\\s\\S]*?</\(tag)>", with: "", options: [.regularExpression, .caseInsensitive])
}
/// Strips HTML from a string.

View File

@@ -7,25 +7,34 @@
//
import Foundation
import os
import RSCore
public typealias DownloadCallback = (Data?, URLResponse?, Error?) -> Swift.Void
public typealias DownloadCallback = @MainActor (Data?, URLResponse?, Error?) -> Swift.Void
/// Simple downloader, for a one-shot download like an image
/// or a web page. For a download-feeds session, see DownloadSession.
public final class Downloader {
/// Caches response for a short time for GET requests. May return cached response.
@MainActor public final class Downloader {
public static let shared = Downloader()
private let urlSession: URLSession
private var callbacks = [URL: [DownloadCallback]]()
// Cache — short-lived
private let cache = Cache<DownloaderRecord>(timeToLive: 60 * 3, timeBetweenCleanups: 60 * 2)
nonisolated private static let logger = Logger(subsystem: Bundle.main.bundleIdentifier!, category: "Downloader")
nonisolated private static let debugLoggingEnabled = false
private init() {
let sessionConfiguration = URLSessionConfiguration.ephemeral
sessionConfiguration.requestCachePolicy = .reloadIgnoringLocalCacheData
sessionConfiguration.httpShouldSetCookies = false
sessionConfiguration.httpCookieAcceptPolicy = .never
sessionConfiguration.httpMaximumConnectionsPerHost = 1
sessionConfiguration.httpCookieStorage = nil
if let userAgentHeaders = UserAgent.headers() {
sessionConfiguration.httpAdditionalHeaders = userAgentHeaders
}
@@ -37,20 +46,103 @@ public final class Downloader {
urlSession.invalidateAndCancel()
}
public func download(_ url: URL, _ completion: DownloadCallback? = nil) {
download(URLRequest(url: url), completion)
public func download(_ url: URL, _ callback: @escaping DownloadCallback) {
assert(Thread.isMainThread)
download(URLRequest(url: url), callback)
}
public func download(_ urlRequest: URLRequest, _ completion: DownloadCallback? = nil) {
public func download(_ urlRequest: URLRequest, _ callback: @escaping DownloadCallback) {
assert(Thread.isMainThread)
guard let url = urlRequest.url else {
Self.logger.fault("Downloader: skipping download for URLRequest without a URL")
return
}
let isCacheableRequest = urlRequest.httpMethod == HTTPMethod.get
// Return cached record if available.
if isCacheableRequest {
if let cachedRecord = cache[url.absoluteString] {
if Self.debugLoggingEnabled {
Self.logger.debug("Downloader: returning cached record for \(url)")
}
callback(cachedRecord.data, cachedRecord.response, cachedRecord.error)
return
}
}
// Add callback. If there is already a download in progress for this URL, return early.
if callbacks[url] == nil {
if Self.debugLoggingEnabled {
Self.logger.debug("Downloader: downloading \(url)")
}
callbacks[url] = [callback]
} else {
// A download is already in progress for this URL. Don't start a separate download.
// Add the callback to the callbacks array for this URL.
if Self.debugLoggingEnabled {
Self.logger.debug("Downloader: download in progress for \(url) — adding callback")
}
callbacks[url]?.append(callback)
return
}
var urlRequestToUse = urlRequest
urlRequestToUse.addSpecialCaseUserAgentIfNeeded()
let task = urlSession.dataTask(with: urlRequestToUse) { (data, response, error) in
DispatchQueue.main.async() {
completion?(data, response, error)
if isCacheableRequest {
if Self.debugLoggingEnabled {
Self.logger.debug("Downloader: caching record for \(url)")
}
let cachedRecord = DownloaderRecord(data: data, response: response, error: error)
self.cache[url.absoluteString] = cachedRecord
}
Task { @MainActor in
self.callAndReleaseCallbacks(url, data, response, error)
}
}
task.resume()
}
}
private extension Downloader {

    /// Hands a finished download's result to every callback waiting on `url` and forgets them.
    ///
    /// The waiting callbacks are removed from `callbacks` *before* any of them runs.
    /// If a callback asks for the same URL again while it is executing, that request
    /// therefore sees no download in progress and is handled as a fresh one. Clearing
    /// the entry only after the loop would append the new callback to an array that is
    /// about to be thrown away, and it would never be called.
    ///
    /// Expected to run on the main thread (asserted in debug builds).
    ///
    /// - Parameters:
    ///   - url: The URL whose download finished; the key into `callbacks`.
    ///   - data: The downloaded data, if any.
    ///   - response: The response received, if any.
    ///   - error: The error reported for the download, if any.
    func callAndReleaseCallbacks(_ url: URL, _ data: Data? = nil, _ response: URLResponse? = nil, _ error: Error? = nil) {

        assert(Thread.isMainThread)

        // Take ownership of the pending callbacks and clear the entry in one step.
        guard let callbacksForURL = callbacks.removeValue(forKey: url) else {
            // A download completed with nobody registered for it — a bookkeeping error.
            assertionFailure("Downloader: downloaded URL \(url) but no callbacks found")
            Self.logger.fault("Downloader: downloaded URL \(url) but no callbacks found")
            return
        }

        if Self.debugLoggingEnabled {
            let count = callbacksForURL.count
            if count == 1 {
                Self.logger.debug("Downloader: calling 1 callback for URL \(url)")
            } else {
                Self.logger.debug("Downloader: calling \(count) callbacks for URL \(url)")
            }
        }

        for callback in callbacksForURL {
            callback(data, response, error)
        }
    }
}
/// The outcome of one download — data, response, and error — bundled so it can be cached.
///
/// `Downloader` keeps these in its `Cache<DownloaderRecord>` and replays the three
/// values to a callback when a cached record is found for a request.
struct DownloaderRecord: CacheRecord, Sendable {
/// Set to the current date when the record is created.
/// NOTE(review): presumably what `Cache` compares against its time-to-live — confirm against `CacheRecord`.
let dateCreated = Date()
/// The downloaded data, if any.
let data: Data?
/// The response received for the download, if any.
let response: URLResponse?
/// The error reported for the download, if any.
let error: Error?
}

View File

@@ -75,36 +75,38 @@ private extension HTMLMetadataDownloader {
}
func downloadMetadata(_ url: String) {
guard let actualURL = URL(string: url) else {
if Self.debugLoggingEnabled {
Self.logger.debug("HTMLMetadataDownloader skipping download for \(url) because it couldnt construct a URL.")
}
return
}
if Self.debugLoggingEnabled {
Self.logger.debug("HTMLMetadataDownloader downloading for \(url)")
}
Downloader.shared.download(actualURL) { data, response, error in
if let data, !data.isEmpty, let response, response.statusIsOK {
let urlToUse = response.url ?? actualURL
let parserData = ParserData(url: urlToUse.absoluteString, data: data)
let htmlMetadata = RSHTMLMetadataParser.htmlMetadata(with: parserData)
if Self.debugLoggingEnabled {
Self.logger.debug("HTMLMetadataDownloader caching parsed metadata for \(url)")
Task { @MainActor in
Downloader.shared.download(actualURL) { data, response, error in
if let data, !data.isEmpty, let response, response.statusIsOK {
let urlToUse = response.url ?? actualURL
let parserData = ParserData(url: urlToUse.absoluteString, data: data)
let htmlMetadata = RSHTMLMetadataParser.htmlMetadata(with: parserData)
if Self.debugLoggingEnabled {
Self.logger.debug("HTMLMetadataDownloader caching parsed metadata for \(url)")
}
self.cache[url] = htmlMetadata
return
}
if let statusCode = response?.forcedStatusCode, (400...499).contains(statusCode) {
self.noteURLDidReturn4xx(url)
}
if Self.debugLoggingEnabled {
Self.logger.debug("HTMLMetadataDownloader failed download for \(url)")
}
self.cache[url] = htmlMetadata
return
}
if let statusCode = response?.forcedStatusCode, (400...499).contains(statusCode) {
self.noteURLDidReturn4xx(url)
}
if Self.debugLoggingEnabled {
Self.logger.debug("HTMLMetadataDownloader failed download for \(url)")
}
}
}