
Commit

feat(integrations): spider tool refactored + readme/typespec updates
Vedantsahai18 committed Dec 3, 2024
1 parent a256a69 commit 0c55e12
Showing 10 changed files with 276 additions and 61 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -1281,7 +1281,7 @@ arguments:
   query: string # The search query for searching with Brave
 output:
-  result: string # The result of the Brave Search
+  result: list[dict] # A list of search results, each containing: title, link, snippet
 ```

</td>
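The Brave integration's output is now structured rather than a single string. A minimal sketch of consuming the new shape, assuming the keys named in the README comment above (title, link, snippet); the sample data is illustrative, not a real API response:

```python
# Sketch of the new Brave Search output shape; keys follow the README
# comment above, values are illustrative.
result = [
    {
        "title": "Example Domain",
        "link": "https://example.com",
        "snippet": "This domain is for use in illustrative examples.",
    },
]

for item in result:
    print(f"{item['title']} ({item['link']}): {item['snippet']}")
```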
@@ -1356,11 +1356,11 @@ setup:
 arguments:
   url: string # The URL for which to fetch data
-  mode: string # The type of crawlers (default: "scrape")
   params: dict # (Optional) The parameters for the Spider API
+  content_type: string # (Optional) The content type to return. Default is "application/json". Other options: "text/csv", "application/xml", "application/jsonl"
 output:
-  documents: list # The documents returned from the spider
+  result: list[dict] # A list of results, each containing: content, error, status, costs, url
 ```

</td>
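The Spider entry drops `mode` and gains `content_type`, which selects the response format. A sketch of an argument payload under the new schema (field names mirror the README entry above; values are illustrative, and the surrounding task plumbing is omitted):

```python
# Sketch of a Spider tool-call payload under the new schema.
spider_arguments = {
    "url": "https://example.com",
    "params": {"limit": 1},  # optional Spider API parameters
    # Defaults to "application/json"; other options per the README:
    # "text/csv", "application/xml", "application/jsonl".
    "content_type": "application/json",
}
```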
@@ -1452,7 +1452,7 @@ arguments:
   base64: boolean # Whether the input file is base64 encoded. Default is false.
 output:
-  documents: list # The parsed data from the document
+  documents: list[Document] # A list of parsed documents
 ```

</td>
@@ -1520,7 +1520,7 @@ arguments:
   sort_order: string # The sort order for the results, options: ascending, descending
 output:
-  result: list # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
+  result: list[dict] # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
 ```

</td>
16 changes: 10 additions & 6 deletions agents-api/agents_api/autogen/Tools.py
@@ -1639,9 +1639,11 @@ class SpiderFetchArguments(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1661,9 +1663,11 @@ class SpiderFetchArgumentsUpdate(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1683,7 +1687,7 @@ class SpiderIntegrationDef(BaseIntegrationDef):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
@@ -1709,7 +1713,7 @@ class SpiderIntegrationDefUpdate(BaseIntegrationDefUpdate):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
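The Literal types make invalid values fail at model construction rather than later in the integration call. A minimal sketch of the regenerated model in use, assuming the autogen import path shown in this file's header; the call site itself is hypothetical:

```python
# Sketch: `mode` is gone; `content_type` replaces it and is checked
# against the four Literal values at validation time.
from agents_api.autogen.Tools import SpiderFetchArguments

args = SpiderFetchArguments(
    url="https://example.com",
    content_type="text/csv",  # defaults to "application/json"
    params={"limit": 1},
)
```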
16 changes: 10 additions & 6 deletions integrations-service/integrations/autogen/Tools.py
@@ -1639,9 +1639,11 @@ class SpiderFetchArguments(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1661,9 +1663,11 @@ class SpiderFetchArgumentsUpdate(BaseModel):
     """
     The URL to fetch data from
     """
-    mode: Literal["crawl", "scrape"] = "scrape"
+    content_type: Literal[
+        "application/json", "text/csv", "application/xml", "application/jsonl"
+    ] = "application/json"
     """
-    The type of crawler to use
+    The content type to return
     """
     params: dict[str, Any] | None = None
     """
@@ -1683,7 +1687,7 @@ class SpiderIntegrationDef(BaseIntegrationDef):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
@@ -1709,7 +1713,7 @@ class SpiderIntegrationDefUpdate(BaseIntegrationDefUpdate):
     """
     The provider must be "spider"
     """
-    method: str | None = None
+    method: Literal["crawl", "links", "screenshot", "search"] | None = None
     """
     The specific method of the integration to call
     """
7 changes: 6 additions & 1 deletion integrations-service/integrations/models/__init__.py
@@ -53,6 +53,11 @@
 from .ffmpeg import FfmpegSearchOutput as FfmpegSearchOutput
 from .llama_parse import LlamaParseFetchOutput as LlamaParseFetchOutput
 from .remote_browser import RemoteBrowserOutput as RemoteBrowserOutput
-from .spider import SpiderFetchOutput as SpiderFetchOutput
+from .spider import (
+    SpiderOutput as SpiderOutput,
+)
+from .spider import (
+    SpiderResponse as SpiderResponse,
+)
 from .weather import WeatherGetOutput as WeatherGetOutput
 from .wikipedia import WikipediaSearchOutput as WikipediaSearchOutput
4 changes: 2 additions & 2 deletions integrations-service/integrations/models/execution.py
@@ -50,7 +50,7 @@
 from .ffmpeg import FfmpegSearchOutput
 from .llama_parse import LlamaParseFetchOutput
 from .remote_browser import RemoteBrowserOutput
-from .spider import SpiderFetchOutput
+from .spider import SpiderOutput
 from .weather import WeatherGetOutput
 from .wikipedia import WikipediaSearchOutput

@@ -98,7 +98,6 @@ class ExecutionError(BaseModel):
 ]

 ExecutionResponse = Union[
-    SpiderFetchOutput,
     WeatherGetOutput,
     EmailOutput,
     WikipediaSearchOutput,
@@ -118,6 +117,7 @@ class ExecutionError(BaseModel):
     CloudinaryUploadOutput,
     ExecutionError,
     ArxivSearchOutput,
+    SpiderOutput,
 ]


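With the union member swapped, SpiderOutput now flows through ExecutionResponse. A minimal sketch (not the service's actual dispatch code), assuming the repo's import paths:

```python
# Sketch: SpiderOutput satisfies the ExecutionResponse union after this
# change. Import paths assume the integrations package layout above.
from integrations.models import SpiderOutput, SpiderResponse
from integrations.models.execution import ExecutionResponse

response: ExecutionResponse = SpiderOutput(
    result=[SpiderResponse(url="https://example.com", status=200)]
)
```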
19 changes: 14 additions & 5 deletions integrations-service/integrations/models/spider.py
@@ -1,10 +1,19 @@
-from langchain_core.documents import Document
-from pydantic import Field
+from typing import Any, List, Optional
+
+from pydantic import BaseModel, Field

 from .base_models import BaseOutput


-class SpiderFetchOutput(BaseOutput):
-    documents: list[Document] = Field(
-        ..., description="The documents returned from the spider"
+class SpiderResponse(BaseModel):
+    content: Optional[str] = None
+    error: Optional[str] = None
+    status: Optional[int] = None
+    costs: Optional[dict[Any, Any]] = None
+    url: Optional[str] = None
+
+
+class SpiderOutput(BaseOutput):
+    result: List[SpiderResponse] = Field(
+        ..., description="The responses from the spider"
     )
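The new models in action (values illustrative). Because every SpiderResponse field is Optional, partial provider responses still validate:

```python
# Sketch using the models defined above; the import path assumes the
# integrations package layout.
from integrations.models.spider import SpiderOutput, SpiderResponse

output = SpiderOutput(
    result=[
        SpiderResponse(
            content="<html>...</html>",
            status=200,
            costs={"total_cost": 0.0001},  # illustrative cost payload
            url="https://example.com",
        ),
        SpiderResponse(error="timeout", status=504, url="https://example.org"),
    ]
)
print(output.result[0].status)  # 200
```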
23 changes: 20 additions & 3 deletions integrations-service/integrations/providers.py
@@ -3,7 +3,6 @@
     # Arguments imports
     BraveSearchArguments,
-    # Setup imports
     # Setup imports
     BraveSearchSetup,
     BrowserbaseCompleteSessionArguments,
     BrowserbaseCreateSessionArguments,
@@ -48,7 +47,7 @@
     LlamaParseFetchOutput,
     ProviderInfo,
     RemoteBrowserOutput,
-    SpiderFetchOutput,
+    SpiderOutput,
     WeatherGetOutput,
     WikipediaSearchOutput,
 )
@@ -99,7 +98,25 @@
             method="crawl",
             description="Crawl a website and extract data",
             arguments=SpiderFetchArguments,
-            output=SpiderFetchOutput,
+            output=SpiderOutput,
         ),
+        BaseProviderMethod(
+            method="links",
+            description="Extract all links from the webpage",
+            arguments=SpiderFetchArguments,
+            output=SpiderOutput,
+        ),
+        BaseProviderMethod(
+            method="screenshot",
+            description="Take a screenshot of the webpage",
+            arguments=SpiderFetchArguments,
+            output=SpiderOutput,
+        ),
+        BaseProviderMethod(
+            method="search",
+            description="Search content within the webpage",
+            arguments=SpiderFetchArguments,
+            output=SpiderOutput,
+        ),
     ],
     info=ProviderInfo(
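All four spider methods share SpiderFetchArguments and SpiderOutput, differing only in name and description. A self-contained sketch of that method table (names and descriptions copied from the diff; the helper itself is hypothetical, not service code):

```python
# Method names and descriptions as registered for the spider provider.
SPIDER_METHODS = {
    "crawl": "Crawl a website and extract data",
    "links": "Extract all links from the webpage",
    "screenshot": "Take a screenshot of the webpage",
    "search": "Search content within the webpage",
}

def describe_spider_methods() -> None:
    """Print each spider method with its description."""
    for name, description in SPIDER_METHODS.items():
        print(f"spider.{name}: {description}")

describe_spider_methods()
```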
