Skip to content

Commit

Permalink
Add new batch scrape API
Browse files Browse the repository at this point in the history
  • Loading branch information
rudrankriyam committed Oct 29, 2024
1 parent 621517c commit 684843c
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 2 deletions.
120 changes: 119 additions & 1 deletion Sources/AgniKit/AgniKit.swift
Original file line number Diff line number Diff line change
Expand Up @@ -286,4 +286,122 @@ public struct AgniKit {

return result
}
}

/// Scrapes several URLs with a single request to the batch endpoint.
///
/// Sends a JSON `POST` to `v1/batch/scrape` carrying `urls`, `formats`, `onlyMainContent`
/// and `timeout`, then decodes the reply as a `BatchScrapeResponse`. The method performs
/// exactly one round trip and does no polling of its own; it returns whatever that single
/// response decodes to.
///
/// - Parameters:
///   - urls: Array of URLs to scrape
///   - formats: Array of desired output formats (e.g. ["markdown", "html"]). Default is ["markdown"]
///   - onlyMainContent: Whether to return only the main content. Default is true
///   - timeout: Timeout in milliseconds, forwarded to the server in the request body. Default is 30000.
///     The client-side request timeout is raised to at least this value (it is never lowered).
///
/// - Returns: A BatchScrapeResponse decoded from the response body
///
/// - Throws: An `NSError` in the "AgniKit" domain when the reply is not an HTTP response (code 0)
///   or when the status code is not 200 (code = status code). Errors from `URLSession`,
///   `JSONSerialization` and `JSONDecoder` are propagated unchanged.
public func batchScrape(
    urls: [String],
    formats: [String] = ["markdown"],
    onlyMainContent: Bool = true,
    timeout: Int = 30000
) async throws -> BatchScrapeResponse {
    var request = makeRequest(for: "v1/batch/scrape")
    request.httpMethod = "POST"
    request.setValue("application/json", forHTTPHeaderField: "Content-Type")

    // `timeout` only tells the server how long it may take. Without this adjustment a value
    // above the transport's own request timeout would be cut short on the client before the
    // server has a chance to answer. `max` guarantees we only ever extend the timeout that
    // `makeRequest` configured; a small slack leaves room for the reply itself.
    let transportSlack: TimeInterval = 5
    request.timeoutInterval = max(
        request.timeoutInterval,
        TimeInterval(timeout) / 1000 + transportSlack
    )

    let body: [String: Any] = [
        "urls": urls,
        "formats": formats,
        "onlyMainContent": onlyMainContent,
        "timeout": timeout
    ]

    request.httpBody = try JSONSerialization.data(withJSONObject: body)

    let (data, response) = try await URLSession.shared.data(for: request)

    guard let httpResponse = response as? HTTPURLResponse else {
        throw NSError(domain: "AgniKit", code: 0, userInfo: [NSLocalizedDescriptionKey: "Invalid response"])
    }

    // Only a plain 200 is treated as success; the status code doubles as the error code.
    guard httpResponse.statusCode == 200 else {
        throw NSError(
            domain: "AgniKit",
            code: httpResponse.statusCode,
            userInfo: [NSLocalizedDescriptionKey: "HTTP error \(httpResponse.statusCode)"]
        )
    }

    let decoder = JSONDecoder()
    // Needed for the `Date` field(s) of `BatchScrapeResponse`.
    decoder.dateDecodingStrategy = .iso8601
    return try decoder.decode(BatchScrapeResponse.self, from: data)
}

/// Starts a batch scrape job and returns the job descriptor.
///
/// Posts `urls`, `formats`, `onlyMainContent` and `"async": true` as JSON to
/// `v1/batch/scrape` and decodes the reply as a `BatchScrapeJobResponse`.
///
/// - Parameters:
///   - urls: Array of URLs to scrape
///   - formats: Array of desired output formats (e.g. ["markdown", "html"]). Default is ["markdown"]
///   - onlyMainContent: Whether to return only the main content. Default is true
///
/// - Returns: A BatchScrapeJobResponse containing the job ID and status URL
///
/// - Throws: An `NSError` in the "AgniKit" domain when the reply is not an HTTP response (code 0)
///   or when the status code is not 200 (code = status code). Transport, serialization and
///   decoding errors are propagated unchanged.
public func createBatchScrapeJob(
    urls: [String],
    formats: [String] = ["markdown"],
    onlyMainContent: Bool = true
) async throws -> BatchScrapeJobResponse {
    // JSON payload; "async" is always sent as true by this method.
    let payload: [String: Any] = [
        "async": true,
        "onlyMainContent": onlyMainContent,
        "formats": formats,
        "urls": urls
    ]

    var jobRequest = makeRequest(for: "v1/batch/scrape")
    jobRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
    jobRequest.httpMethod = "POST"
    jobRequest.httpBody = try JSONSerialization.data(withJSONObject: payload)

    let (responseData, urlResponse) = try await URLSession.shared.data(for: jobRequest)

    guard let http = urlResponse as? HTTPURLResponse else {
        throw NSError(domain: "AgniKit", code: 0, userInfo: [NSLocalizedDescriptionKey: "Invalid response"])
    }

    // Anything other than 200 is reported with the status code as the error code.
    let status = http.statusCode
    if status != 200 {
        throw NSError(
            domain: "AgniKit",
            code: status,
            userInfo: [NSLocalizedDescriptionKey: "HTTP error \(status)"]
        )
    }

    let jobDecoder = JSONDecoder()
    return try jobDecoder.decode(BatchScrapeJobResponse.self, from: responseData)
}

/// Fetches the current state of a batch scrape job.
///
/// Requests `v1/batch/scrape/<id>` and decodes the reply as a `BatchScrapeResponse`.
/// No HTTP method, header or body is set here, so the request is sent exactly as
/// `makeRequest(for:)` builds it.
///
/// - Parameter id: The job ID returned from createBatchScrapeJob. It is inserted into the
///   path as-is.
///
/// - Returns: A BatchScrapeResponse containing the current status and any completed results
///
/// - Throws: An `NSError` in the "AgniKit" domain when the reply is not an HTTP response (code 0)
///   or when the status code is not 200 (code = status code). Transport and decoding errors
///   are propagated unchanged.
public func getBatchScrapeStatus(id: String) async throws -> BatchScrapeResponse {
    let statusEndpoint = "v1/batch/scrape/\(id)"
    let (payload, urlResponse) = try await URLSession.shared.data(for: makeRequest(for: statusEndpoint))

    guard let http = urlResponse as? HTTPURLResponse else {
        throw NSError(domain: "AgniKit", code: 0, userInfo: [NSLocalizedDescriptionKey: "Invalid response"])
    }

    // Only 200 counts as success; the status code is surfaced as the error code otherwise.
    let code = http.statusCode
    if code != 200 {
        throw NSError(
            domain: "AgniKit",
            code: code,
            userInfo: [NSLocalizedDescriptionKey: "HTTP error \(code)"]
        )
    }

    let statusDecoder = JSONDecoder()
    // Needed for the `Date` field(s) of `BatchScrapeResponse`.
    statusDecoder.dateDecodingStrategy = .iso8601
    return try statusDecoder.decode(BatchScrapeResponse.self, from: payload)
}
}
64 changes: 64 additions & 0 deletions Sources/AgniKit/BatchScrapeResponse.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import Foundation

/// Represents the response from a batch scrape operation
///
/// Decoding is tolerant about the format of `expiresAt`: the decoder's own
/// `dateDecodingStrategy` is tried first, and if that fails the value is re-read as an
/// ISO 8601 string with fractional seconds (e.g. `2024-10-29T12:00:00.000Z`), which the
/// plain `.iso8601` strategy rejects. Encoding is unchanged (synthesized).
public struct BatchScrapeResponse: Codable {
    /// The current status of the batch scrape job
    public let status: String

    /// Total number of URLs in the batch
    public let total: Int

    /// Number of completed URL scrapes
    public let completed: Int

    /// Number of API credits used for this operation
    public let creditsUsed: Int

    /// Timestamp when the results will expire
    public let expiresAt: Date

    /// Array of scraped results for each URL
    public let data: [BatchScrapeResult]

    /// Wire names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case status, total, completed, creditsUsed, expiresAt, data
    }
}

// Declared in an extension so the struct keeps its implicit memberwise initializer.
extension BatchScrapeResponse {
    /// Decodes a response; every key is required.
    ///
    /// - Throws: The `DecodingError` raised for a missing or mistyped key. For `expiresAt`
    ///   the error from the decoder's date strategy is rethrown when the fallback parse
    ///   fails as well.
    public init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        status = try container.decode(String.self, forKey: .status)
        total = try container.decode(Int.self, forKey: .total)
        completed = try container.decode(Int.self, forKey: .completed)
        creditsUsed = try container.decode(Int.self, forKey: .creditsUsed)
        expiresAt = try Self.decodeTimestamp(from: container, forKey: .expiresAt)
        data = try container.decode([BatchScrapeResult].self, forKey: .data)
    }

    /// Decodes a date with the decoder's configured strategy, falling back to ISO 8601 with
    /// fractional seconds when that strategy rejects the value.
    private static func decodeTimestamp<Key: CodingKey>(
        from container: KeyedDecodingContainer<Key>,
        forKey key: Key
    ) throws -> Date {
        do {
            return try container.decode(Date.self, forKey: key)
        } catch {
            // Keep the original error: it describes the failure under the caller's chosen strategy.
            guard let text = try? container.decode(String.self, forKey: key) else {
                throw error
            }
            let formatter = ISO8601DateFormatter()
            formatter.formatOptions = [.withInternetDateTime, .withFractionalSeconds]
            guard let parsed = formatter.date(from: text) else {
                throw error
            }
            return parsed
        }
    }
}

/// One entry of a batch scrape: the content and metadata obtained for a single URL
public struct BatchScrapeResult: Codable {
    /// Markdown rendition of the page; `nil` when the key is absent (e.g. format not requested)
    public let markdown: String?

    /// HTML rendition of the page; `nil` when the key is absent (e.g. format not requested)
    public let html: String?

    /// Metadata about the scraped page (always required when decoding)
    public let metadata: BatchScrapeMetadata

    /// Wire names, spelled out explicitly; they match the property names one-to-one.
    private enum CodingKeys: String, CodingKey {
        case markdown = "markdown"
        case html = "html"
        case metadata = "metadata"
    }
}

/// Metadata associated with a scraped page
///
/// Decoding is lenient for the descriptive text fields: when `title`, `language` or
/// `description` is missing or `null` in the payload, the property is set to an empty
/// string instead of failing the decode of the whole batch. `sourceURL` and `statusCode`
/// remain required. Encoding is unchanged (synthesized).
public struct BatchScrapeMetadata: Codable {
    /// Title of the webpage (empty when the payload has none)
    public let title: String

    /// Detected language of the content (empty when the payload has none)
    public let language: String

    /// Original URL that was scraped
    public let sourceURL: String

    /// Meta description of the webpage (empty when the payload has none)
    public let description: String

    /// HTTP status code of the response
    public let statusCode: Int

    /// Wire names; identical to the property names.
    private enum CodingKeys: String, CodingKey {
        case title, language, sourceURL, description, statusCode
    }
}

// Declared in an extension so the struct keeps its implicit memberwise initializer.
extension BatchScrapeMetadata {
    /// Decodes page metadata, substituting `""` for absent or `null` text fields.
    ///
    /// - Throws: A `DecodingError` when `sourceURL` or `statusCode` is missing, or when any
    ///   present key has the wrong type.
    public init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        // Optional on the wire: not every page has a title, a language or a meta description.
        title = try container.decodeIfPresent(String.self, forKey: .title) ?? ""
        language = try container.decodeIfPresent(String.self, forKey: .language) ?? ""
        description = try container.decodeIfPresent(String.self, forKey: .description) ?? ""
        // Required: these identify the page and the outcome of the fetch.
        sourceURL = try container.decode(String.self, forKey: .sourceURL)
        statusCode = try container.decode(Int.self, forKey: .statusCode)
    }
}

/// Reply received when an asynchronous batch scrape job is created
public struct BatchScrapeJobResponse: Codable {
    /// `true` when the job was successfully created
    public let success: Bool

    /// Unique identifier of the batch scrape job
    public let id: String

    /// URL at which the job status can be checked
    public let url: String

    /// Wire names, spelled out explicitly; they match the property names one-to-one.
    private enum CodingKeys: String, CodingKey {
        case success = "success"
        case id = "id"
        case url = "url"
    }
}
2 changes: 1 addition & 1 deletion Tests/AgniKitTests/AgniKitTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import Foundation
/// It verifies that the scraped content contains expected elements from the Apple documentation.
@Test("Scrape Apple's Defining Tests documentation")
func testWebScraping() async throws {
let agniKit = AgniKit(apiKey: "no")
let agniKit = AgniKit(apiKey: "fc-")

let url = "https://developer.apple.com/documentation/testing/definingtests"

Expand Down

0 comments on commit 684843c

Please sign in to comment.