Skip to content

Commit

Permalink
simplify sitemap tool -- ai isn't understanding. spoon feed sitemap urls
Browse files Browse the repository at this point in the history
  • Loading branch information
darecstowell committed Jan 14, 2025
1 parent e41e3a0 commit b11704e
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 60 deletions.
4 changes: 2 additions & 2 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from app.helpers.events import EventHandler
from app.helpers.render import render_template
from app.settings import DATABASE_URL, OPENAI_API_KEY, OPENAI_MODEL
from app.tools import load_page_content, sitemap_crawler, wiki_page, wiki_search
from app.tools import fetch_sitemap, load_page_content, wiki_page, wiki_search

# Configure logging
logging.basicConfig(level=logging.INFO)
Expand All @@ -28,7 +28,7 @@
tools_list = [
wiki_search.WikiSearch(), # type: ignore
wiki_page.WikiPage(), # type: ignore
sitemap_crawler.SitemapCrawler(), # type: ignore
fetch_sitemap.FetchSitemap(), # type: ignore
load_page_content.LoadPageContent(), # type: ignore
]
# TODO: this is currently creating a new assistant every time the app is restarted
Expand Down
38 changes: 38 additions & 0 deletions app/tools/fetch_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pydantic
from pydantic import BaseModel

from app.helpers import render
from app.helpers.scrape import fetch_sitemap_links, get_robots_txt, is_url_allowed
from app.tools.base import AssistantTool


class FetchSitemapParams(BaseModel):
    """Parameters accepted by the ``web_fetch_sitemap`` tool."""

    # The description is part of the JSON schema sent to the model; an empty
    # string gives the model no guidance on what value to supply.
    sitemap_url: str = pydantic.Field(
        ...,
        description="Full URL of a sitemap.xml file, e.g. https://example.com/sitemap.xml",
    )


class FetchSitemap(AssistantTool):
    """Assistant tool that fetches a sitemap.xml and lists the URLs it contains.

    The links are rendered as a markdown bullet list so the model can pick
    concrete page URLs to pass to the page-loading tool.
    """

    name: str = "web_fetch_sitemap"
    friendly_name: str = "Website Links Finder"
    # NOTE: rendered once at class-definition (import) time.
    description: str = render.render_template("fetch_sitemap_description.jinja2")
    parameters = FetchSitemapParams

    def run(self, **kwargs) -> str:
        """Fetch the sitemap at ``sitemap_url`` and return its links as markdown.

        Raises:
            ValueError: if the URL does not look like a sitemap, or if the
                site's robots.txt disallows scraping it.
        """
        input_url = kwargs.get("sitemap_url", "")

        # Cheap string validation first, so we don't spend a network round
        # trip fetching robots.txt for a URL that can never be a sitemap.
        if "sitemap.xml" not in input_url:
            raise ValueError("URL does not appear to be a sitemap. Please try again.")

        robots_txt = get_robots_txt(input_url)
        if not is_url_allowed(input_url, robots_txt):
            raise ValueError(f"Scraping is disallowed by {input_url}")

        links = fetch_sitemap_links(input_url)

        # Render directly — the original built a one-entry dict only to
        # immediately iterate it. Output format is unchanged:
        # heading, one "- <link>" line per URL, trailing blank line.
        lines = [f"## Sitemap Links for: {input_url}"]
        lines.extend(f"- {link}" for link in links)
        return "\n".join(lines) + "\n\n"
50 changes: 0 additions & 50 deletions app/tools/sitemap_crawler.py

This file was deleted.

7 changes: 4 additions & 3 deletions templates/agent_instructions.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ You must reject and explicitly state limitations when requested to process or an
You must always use a tool before providing information
Think about your plan step-by-step and communicate your approach as a brief outline before executing
When searching the wiki, only use the wiki tools
When searching the web, only use the web search tools
When searching the web, only use the web tools
When you are unable to answer a question via the wiki tools, continue your search on the web in this order without confirmation:
1. https://maxroll.gg
2. https://mobalytics.gg
{# TODO: these likely won't change, but should add a get_website_root_sitemap tool #}
1. https://maxroll.gg/poe2/sitemap.xml
2. https://mobalytics.gg/poe-2/sitemap.xml
Once you have exhausted all search options, ask the player clarifying questions or admit failure
1 change: 1 addition & 0 deletions templates/fetch_sitemap_description.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Retrieve a website's sitemap.xml file and use it to navigate the website's structure
2 changes: 0 additions & 2 deletions templates/load_page_content_description.jinja2
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
This tool fetches and returns the readable content from a specified URL
Verify url with sitemap_crawler before calling this tool
.xml is not a valid URL
3 changes: 0 additions & 3 deletions templates/sitemap_crawler_description.jinja2

This file was deleted.

0 comments on commit b11704e

Please sign in to comment.