-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Simplify sitemap tool -- the AI isn't understanding it, so spoon-feed it sitemap URLs
- Loading branch information
1 parent
e41e3a0
commit b11704e
Showing
7 changed files
with
45 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import pydantic | ||
from pydantic import BaseModel | ||
|
||
from app.helpers import render | ||
from app.helpers.scrape import fetch_sitemap_links, get_robots_txt, is_url_allowed | ||
from app.tools.base import AssistantTool | ||
|
||
|
||
class FetchSitemapParams(BaseModel):
    """Arguments accepted by the ``web_fetch_sitemap`` tool."""

    # Required (no default). The description is part of the generated schema,
    # so it spells out the "sitemap.xml" requirement that FetchSitemap.run
    # enforces instead of leaving the caller to guess.
    sitemap_url: str = pydantic.Field(
        ...,
        description=(
            "Full URL of the website's sitemap file. It must contain "
            "'sitemap.xml', e.g. https://example.com/sitemap.xml"
        ),
    )
|
||
|
||
class FetchSitemap(AssistantTool):
    """Assistant tool that lists the links found in a website's sitemap.

    Takes a single ``sitemap_url`` argument (see ``FetchSitemapParams``) and
    returns the sitemap's links rendered as a Markdown bullet list.
    """

    name: str = "web_fetch_sitemap"
    friendly_name: str = "Website Links Finder"
    description: str = render.render_template("fetch_sitemap_description.jinja2")
    parameters = FetchSitemapParams

    def run(self, **kwargs) -> str:
        """Fetch the sitemap at ``sitemap_url`` and render its links as Markdown.

        Args:
            **kwargs: Tool arguments; only ``sitemap_url`` is read. A missing
                or ``None`` value is treated as an empty string.

        Returns:
            A Markdown section headed with the sitemap URL, followed by one
            ``- link`` bullet per link and a trailing blank line.

        Raises:
            ValueError: If the URL does not contain ``sitemap.xml``, or if
                ``is_url_allowed`` rejects it against the robots.txt
                returned by ``get_robots_txt``.
        """
        input_url = kwargs.get("sitemap_url") or ""

        # Run the purely local check first so that input which would be
        # rejected anyway never reaches get_robots_txt / fetch_sitemap_links,
        # and the caller gets the more actionable error message.
        if "sitemap.xml" not in input_url:
            raise ValueError("URL does not appear to be a sitemap. Please try again.")

        robots_txt = get_robots_txt(input_url)
        if not is_url_allowed(input_url, robots_txt):
            raise ValueError(f"Scraping is disallowed by {input_url}")

        links = fetch_sitemap_links(input_url)

        # Only one sitemap is ever processed, so build its section directly
        # with a single join rather than repeated string concatenation.
        bullets = "".join(f"- {link}\n" for link in links)
        return f"## Sitemap Links for: {input_url}\n{bullets}\n"
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Retrieve a website's sitemap.xml file and use it to navigate the website's structure |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1 @@ | ||
This tool fetches and returns the readable content from a specified URL | ||
Verify url with sitemap_crawler before calling this tool | ||
.xml is not a valid URL |
This file was deleted.
Oops, something went wrong.