diff --git a/README.md b/README.md
index d49600980..55ff0ce5b 100644
--- a/README.md
+++ b/README.md
@@ -1281,7 +1281,7 @@ arguments:
   query: string # The search query for searching with Brave
 
 output:
-  result: string # The result of the Brave Search
+  result: list[dict] # A list of search results, each containing: title, link, snippet
 ```
@@ -1356,11 +1356,11 @@ setup:
 arguments:
   url: string # The URL for which to fetch data
-  mode: string # The type of crawlers (default: "scrape")
   params: dict # (Optional) The parameters for the Spider API
+  content_type: string # (Optional) The content type to return. Default is "application/json". Other options: "text/csv", "application/xml", "application/jsonl"
 
 output:
-  documents: list # The documents returned from the spider
+  result: list[dict] # A list of results, each containing: content, error, status, costs, url
 ```
@@ -1452,7 +1452,7 @@ arguments:
   base64: boolean # Whether the input file is base64 encoded. Default is false.
 
 output:
-  documents: list # The parsed data from the document
+  documents: list[Document] # A list of parsed documents
 ```
@@ -1520,7 +1520,7 @@ arguments:
   sort_order: string # The sort order for the results, options: ascending, descending
 
 output:
-  result: list # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
+  result: list[dict] # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
 ```
diff --git a/agents-api/agents_api/autogen/Tools.py b/agents-api/agents_api/autogen/Tools.py
index c836e041a..d872674af 100644
--- a/agents-api/agents_api/autogen/Tools.py
+++ b/agents-api/agents_api/autogen/Tools.py
@@ -1639,9 +1639,11 @@ class SpiderFetchArguments(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1661,9 +1663,11 @@ class SpiderFetchArgumentsUpdate(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1683,7 +1687,7 @@ class SpiderIntegrationDef(BaseIntegrationDef):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
@@ -1709,7 +1713,7 @@ class SpiderIntegrationDefUpdate(BaseIntegrationDefUpdate):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
diff --git a/integrations-service/integrations/autogen/Tools.py b/integrations-service/integrations/autogen/Tools.py
index c836e041a..d872674af 100644
--- a/integrations-service/integrations/autogen/Tools.py
+++ b/integrations-service/integrations/autogen/Tools.py
@@ -1639,9 +1639,11 @@ class SpiderFetchArguments(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1661,9 +1663,11 @@ class SpiderFetchArgumentsUpdate(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1683,7 +1687,7 @@ class SpiderIntegrationDef(BaseIntegrationDef):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
@@ -1709,7 +1713,7 @@ class SpiderIntegrationDefUpdate(BaseIntegrationDefUpdate):
     """
     The provider must be "spider"
    """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
diff --git a/integrations-service/integrations/models/__init__.py b/integrations-service/integrations/models/__init__.py
index e5fe3f218..97b4564fc 100644
--- a/integrations-service/integrations/models/__init__.py
+++ b/integrations-service/integrations/models/__init__.py
@@ -53,6 +53,11 @@
 from .ffmpeg import FfmpegSearchOutput as FfmpegSearchOutput
 from .llama_parse import LlamaParseFetchOutput as LlamaParseFetchOutput
 from .remote_browser import RemoteBrowserOutput as RemoteBrowserOutput
-from .spider import SpiderFetchOutput as SpiderFetchOutput
+from .spider import (
+    SpiderOutput as SpiderOutput,
+)
+from .spider import (
+    SpiderResponse as SpiderResponse,
+)
 from .weather import WeatherGetOutput as WeatherGetOutput
 from .wikipedia import WikipediaSearchOutput as WikipediaSearchOutput
diff --git a/integrations-service/integrations/models/execution.py b/integrations-service/integrations/models/execution.py
index 397782e87..42cae6cbc 100644
--- a/integrations-service/integrations/models/execution.py
+++ b/integrations-service/integrations/models/execution.py
@@ -50,7 +50,7 @@
 from .ffmpeg import FfmpegSearchOutput
 from .llama_parse import LlamaParseFetchOutput
 from .remote_browser import RemoteBrowserOutput
-from .spider import SpiderFetchOutput
+from .spider import SpiderOutput
 from .weather import WeatherGetOutput
 from .wikipedia import WikipediaSearchOutput
@@ -98,7 +98,6 @@ class ExecutionError(BaseModel):
 ]
 
 ExecutionResponse = Union[
-    SpiderFetchOutput,
     WeatherGetOutput,
     EmailOutput,
     WikipediaSearchOutput,
@@ -118,6 +117,7 @@ class ExecutionError(BaseModel):
     CloudinaryUploadOutput,
     ExecutionError,
     ArxivSearchOutput,
+    SpiderOutput,
 ]
diff --git a/integrations-service/integrations/models/spider.py b/integrations-service/integrations/models/spider.py
index 989e8411f..4acfd8a66 100644
--- a/integrations-service/integrations/models/spider.py
+++ b/integrations-service/integrations/models/spider.py
@@ -1,10 +1,19 @@
-from langchain_core.documents import Document
-from pydantic import Field
+from typing import Any, List, Optional
+
+from pydantic import BaseModel, Field
 
 from .base_models import BaseOutput
 
 
-class SpiderFetchOutput(BaseOutput):
-    documents: list[Document] = Field(
-        ..., description="The documents returned from the spider"
+class SpiderResponse(BaseModel):
+    content: Optional[str] = None
+    error: Optional[str] = None
+    status: Optional[int] = None
+    costs: Optional[dict[Any, Any]] = None
+    url: Optional[str] = None
+
+
+class SpiderOutput(BaseOutput):
+    result: List[SpiderResponse] = Field(
+        ..., description="The responses from the spider"
     )
diff --git a/integrations-service/integrations/providers.py b/integrations-service/integrations/providers.py
index 5fb29b75c..e4a5af9cc 100644
--- a/integrations-service/integrations/providers.py
+++ b/integrations-service/integrations/providers.py
@@ -3,7 +3,6 @@
     # Arguments imports
     BraveSearchArguments,
     # Setup imports
-    # Setup imports
     BraveSearchSetup,
     BrowserbaseCompleteSessionArguments,
     BrowserbaseCreateSessionArguments,
@@ -48,7 +47,7 @@
     LlamaParseFetchOutput,
     ProviderInfo,
     RemoteBrowserOutput,
-    SpiderFetchOutput,
+    SpiderOutput,
     WeatherGetOutput,
     WikipediaSearchOutput,
 )
@@ -99,7 +98,25 @@
             method="crawl",
             description="Crawl a website and extract data",
             arguments=SpiderFetchArguments,
-            output=SpiderFetchOutput,
+            output=SpiderOutput,
+        ),
+        BaseProviderMethod(
+            method="links",
+            description="Extract all links from the webpage",
+            arguments=SpiderFetchArguments,
+            output=SpiderOutput,
+        ),
+        BaseProviderMethod(
+            method="screenshot",
+            description="Take a screenshot of the webpage",
+            arguments=SpiderFetchArguments,
+            output=SpiderOutput,
+        ),
+        BaseProviderMethod(
+            method="search",
+            description="Search content within the webpage",
+            arguments=SpiderFetchArguments,
+            output=SpiderOutput,
         ),
     ],
     info=ProviderInfo(
diff --git a/integrations-service/integrations/utils/integrations/spider.py b/integrations-service/integrations/utils/integrations/spider.py
index ed2129588..a4bd35071 100644
--- a/integrations-service/integrations/utils/integrations/spider.py
+++ b/integrations-service/integrations/utils/integrations/spider.py
@@ -1,17 +1,28 @@
 from beartype import beartype
-from langchain_community.document_loaders import SpiderLoader
+from spider import AsyncSpider
 from tenacity import retry, stop_after_attempt, wait_exponential
 
 from ...autogen.Tools import SpiderFetchArguments, SpiderSetup
 from ...env import (
     spider_api_key,  # Import env to access environment variables
 )
-from ...models import SpiderFetchOutput
+from ...models import SpiderOutput, SpiderResponse
 
 
 # Spider client instances
-def get_spider_client(api_key: str, **kwargs) -> SpiderLoader:
-    return SpiderLoader(api_key=api_key, **kwargs)
+def get_spider_client(api_key: str) -> AsyncSpider:
+    return AsyncSpider(api_key=api_key)
+
+
+def get_api_key(setup: SpiderSetup) -> str:
+    """
+    Helper function to get the API key.
+    """
+    return (
+        setup.spider_api_key
+        if setup.spider_api_key != "DEMO_API_KEY"
+        else spider_api_key
+    )
 
 
 @beartype
@@ -20,28 +31,169 @@ def get_spider_client(api_key: str, **kwargs) -> SpiderLoader:
     reraise=True,
     stop=stop_after_attempt(4),
 )
-async def crawl(
+async def crawl(setup: SpiderSetup, arguments: SpiderFetchArguments) -> SpiderOutput:
+    """
+    Crawl a website and extract data.
+    """
+    assert isinstance(setup, SpiderSetup), "Invalid setup"
+    assert isinstance(arguments, SpiderFetchArguments), "Invalid arguments"
+
+    api_key = get_api_key(setup)
+
+    # Initialize final_result
+    final_result = []
+    results = None
+
+    # Initialize spider_client
+    async with get_spider_client(api_key=api_key) as spider_client:
+        async for result in spider_client.crawl_url(
+            url=str(arguments.url),
+            params=arguments.params,
+            stream=False,
+            content_type=arguments.content_type,
+        ):
+            results = result
+
+    for page in results:
+        final_result.append(
+            SpiderResponse(
+                url=page["url"] if page["url"] is not None else None,
+                content=page["content"] if page["content"] is not None else None,
+                error=page["error"] if page["error"] is not None else None,
+                status=page["status"] if page["status"] is not None else None,
+                costs=page["costs"] if page["costs"] is not None else None,
+            )
+        )
+    # Return final_result
+    return SpiderOutput(result=final_result)
+
+
+@beartype
+@retry(
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    reraise=True,
+    stop=stop_after_attempt(4),
+)
+async def links(setup: SpiderSetup, arguments: SpiderFetchArguments) -> SpiderOutput:
+    """
+    Extract all links from the webpage.
+    """
+    assert isinstance(setup, SpiderSetup), "Invalid setup"
+    assert isinstance(arguments, SpiderFetchArguments), "Invalid arguments"
+
+    api_key = get_api_key(setup)
+
+    # Initialize final_result
+    final_result = []
+    results = None
+
+    # Initialize spider_client
+    async with get_spider_client(api_key=api_key) as spider_client:
+        async for result in spider_client.links(
+            url=str(arguments.url),
+            params=arguments.params,
+            stream=False,
+            content_type=arguments.content_type,
+        ):
+            results = result
+
+    for page in results:
+        final_result.append(
+            SpiderResponse(
+                url=page["url"] if page["url"] is not None else None,
+                content=page["content"] if page["content"] is not None else None,
+                error=page["error"] if page["error"] is not None else None,
+                status=page["status"] if page["status"] is not None else None,
+                costs=page["costs"] if page["costs"] is not None else None,
+            )
+        )
+    # Return final_result
+    return SpiderOutput(result=final_result)
+
+
+@beartype
+@retry(
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    reraise=True,
+    stop=stop_after_attempt(4),
+)
+async def screenshot(
     setup: SpiderSetup, arguments: SpiderFetchArguments
-) -> SpiderFetchOutput:
+) -> SpiderOutput:
     """
-    Fetches data from a specified URL.
+    Take a screenshot of the webpage.
     """
+    assert isinstance(setup, SpiderSetup), "Invalid setup"
+    assert isinstance(arguments, SpiderFetchArguments), "Invalid arguments"
+
+    api_key = get_api_key(setup)
+
+    # Initialize final_result
+    final_result = []
+    results = None
+
+    # Initialize spider_client
+    async with get_spider_client(api_key=api_key) as spider_client:
+        async for result in spider_client.screenshot(
+            url=str(arguments.url),
+            params=arguments.params,
+            stream=False,
+            content_type=arguments.content_type,
+        ):
+            results = result
+
+    for page in results:
+        final_result.append(
+            SpiderResponse(
+                url=page["url"] if page["url"] is not None else None,
+                content=page["content"] if page["content"] is not None else None,
+                error=page["error"] if page["error"] is not None else None,
+                status=page["status"] if page["status"] is not None else None,
+                costs=page["costs"] if page["costs"] is not None else None,
+            )
+        )
+    # Return final_result
+    return SpiderOutput(result=final_result)
+
+@beartype
+@retry(
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    reraise=True,
+    stop=stop_after_attempt(4),
+)
+async def search(setup: SpiderSetup, arguments: SpiderFetchArguments) -> SpiderOutput:
+    """
+    Search content within the webpage.
+    """
     assert isinstance(setup, SpiderSetup), "Invalid setup"
     assert isinstance(arguments, SpiderFetchArguments), "Invalid arguments"
 
-    api_key = (
-        setup.spider_api_key
-        if setup.spider_api_key != "DEMO_API_KEY"
-        else spider_api_key
-    )
+    api_key = get_api_key(setup)
 
-    spider_loader = get_spider_client(
-        api_key=api_key,
-        url=str(arguments.url),
-        mode=arguments.mode,
-        params=arguments.params,
-    )
+    # Initialize final_result
+    final_result = []
+    results = None
+
+    # Initialize spider_client
+    async with get_spider_client(api_key=api_key) as spider_client:
+        async for result in spider_client.search(
+            url=str(arguments.url),
+            params=arguments.params,
+            stream=False,
+            content_type=arguments.content_type,
+        ):
+            results = result
 
-    documents = await spider_loader.aload()
-    return SpiderFetchOutput(documents=documents)
+    for page in results:
+        final_result.append(
+            SpiderResponse(
+                url=page["url"] if page["url"] is not None else None,
+                content=page["content"] if page["content"] is not None else None,
+                error=page["error"] if page["error"] is not None else None,
+                status=page["status"] if page["status"] is not None else None,
+                costs=page["costs"] if page["costs"] is not None else None,
+            )
+        )
+    # Return final_result
+    return SpiderOutput(result=final_result)
diff --git a/typespec/tools/spider.tsp b/typespec/tools/spider.tsp
index be28cd8fe..e146add61 100644
--- a/typespec/tools/spider.tsp
+++ b/typespec/tools/spider.tsp
@@ -15,20 +15,30 @@ model SpiderFetchArguments {
   /** The URL to fetch data from */
   url: url;
 
-  /** The type of crawler to use */
-  mode?: "crawl" | "scrape" = "scrape";
+  /** The content type to return */
+  content_type?: "application/json" | "text/csv" | "application/xml" | "application/jsonl" = "application/json";
 
   /** Additional parameters for the Spider API */
   params?: Record<unknown>;
 }
 
+alias SpiderMethod =
+  | /** Crawl a website and extract data */
+  "crawl"
+  | /** Retrieve links from the specified URL. */
+  "links"
+  | /** Take a screenshot of the specified URL. */
+  "screenshot"
+  | /** Perform a search and gather a list of websites to start crawling and collect resources. */
+  "search";
+
 /** Spider integration definition */
 model SpiderIntegrationDef extends BaseIntegrationDef {
   /** The provider must be "spider" */
   provider: "spider" = "spider";
 
   /** The specific method of the integration to call */
-  method?: string;
+  method?: SpiderMethod;
 
   /** The setup parameters for Spider */
   setup?: SpiderSetup;
diff --git a/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml b/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml
index d0521ae0a..706c16bf0 100644
--- a/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml
+++ b/typespec/tsp-output/@typespec/openapi3/openapi-1.0.0.yaml
@@ -8378,13 +8378,15 @@ components:
           type: string
           format: uri
           description: The URL to fetch data from
-        mode:
+        content_type:
           type: string
           enum:
-            - crawl
-            - scrape
-          description: The type of crawler to use
-          default: scrape
+            - application/json
+            - text/csv
+            - application/xml
+            - application/jsonl
+          description: The content type to return
+          default: application/json
         params:
           type: object
           additionalProperties: {}
@@ -8397,13 +8399,15 @@ components:
           type: string
           format: uri
           description: The URL to fetch data from
-        mode:
+        content_type:
           type: string
           enum:
-            - crawl
-            - scrape
-          description: The type of crawler to use
-          default: scrape
+            - application/json
+            - text/csv
+            - application/xml
+            - application/jsonl
+          description: The content type to return
+          default: application/json
         params:
           type: object
           additionalProperties: {}
@@ -8422,6 +8426,11 @@ components:
           default: spider
         method:
           type: string
+          enum:
+            - crawl
+            - links
+            - screenshot
+            - search
           description: The specific method of the integration to call
         setup:
           allOf:
@@ -8445,6 +8454,11 @@ components:
           default: spider
         method:
           type: string
+          enum:
+            - crawl
+            - links
+            - screenshot
+            - search
           description: The specific method of the integration to call
         setup:
           allOf: