Skip to content

Commit

Permalink
simplify sitemap tool -- ai isn't understanding. spoon feed sitemap urls
Browse files Browse the repository at this point in the history
  • Loading branch information
darecstowell committed Jan 14, 2025
1 parent e41e3a0 commit b11704e
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 60 deletions.
4 changes: 2 additions & 2 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from app.helpers.events import EventHandler
from app.helpers.render import render_template
from app.settings import DATABASE_URL, OPENAI_API_KEY, OPENAI_MODEL
from app.tools import load_page_content, sitemap_crawler, wiki_page, wiki_search
from app.tools import fetch_sitemap, load_page_content, wiki_page, wiki_search

# Configure logging
logging.basicConfig(level=logging.INFO)
Expand All @@ -28,7 +28,7 @@
tools_list = [
wiki_search.WikiSearch(), # type: ignore
wiki_page.WikiPage(), # type: ignore
sitemap_crawler.SitemapCrawler(), # type: ignore
fetch_sitemap.FetchSitemap(), # type: ignore
load_page_content.LoadPageContent(), # type: ignore
]
# TODO: this is currently creating a new assistant every time the app is restarted
Expand Down
38 changes: 38 additions & 0 deletions app/tools/fetch_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pydantic
from pydantic import BaseModel

from app.helpers import render
from app.helpers.scrape import fetch_sitemap_links, get_robots_txt, is_url_allowed
from app.tools.base import AssistantTool


class FetchSitemapParams(BaseModel):
    """Parameters accepted by the ``web_fetch_sitemap`` tool."""

    # The description is part of the JSON schema sent to the model; an empty
    # string gives the model no guidance on what value to supply.
    sitemap_url: str = pydantic.Field(
        ...,
        description="Full URL of a sitemap.xml file, e.g. https://example.com/sitemap.xml",
    )


class FetchSitemap(AssistantTool):
    """Assistant tool that fetches a sitemap.xml and lists the URLs it contains.

    The links are rendered as a markdown bullet list so the model can pick
    concrete page URLs to pass to the page-loading tool.
    """

    name: str = "web_fetch_sitemap"
    friendly_name: str = "Website Links Finder"
    # NOTE: rendered once at class-definition (import) time.
    description: str = render.render_template("fetch_sitemap_description.jinja2")
    parameters = FetchSitemapParams

    def run(self, **kwargs) -> str:
        """Fetch the sitemap at ``sitemap_url`` and return its links as markdown.

        Raises:
            ValueError: if the URL does not look like a sitemap, or if the
                site's robots.txt disallows scraping it.
        """
        input_url = kwargs.get("sitemap_url", "")

        # Cheap string validation first, so we don't spend a network round
        # trip fetching robots.txt for a URL that can never be a sitemap.
        if "sitemap.xml" not in input_url:
            raise ValueError("URL does not appear to be a sitemap. Please try again.")

        robots_txt = get_robots_txt(input_url)
        if not is_url_allowed(input_url, robots_txt):
            raise ValueError(f"Scraping is disallowed by {input_url}")

        links = fetch_sitemap_links(input_url)

        # Render directly — the original built a one-entry dict only to
        # immediately iterate it. Output format is unchanged:
        # heading, one "- <link>" line per URL, trailing blank line.
        lines = [f"## Sitemap Links for: {input_url}"]
        lines.extend(f"- {link}" for link in links)
        return "\n".join(lines) + "\n\n"
50 changes: 0 additions & 50 deletions app/tools/sitemap_crawler.py

This file was deleted.

7 changes: 4 additions & 3 deletions templates/agent_instructions.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ You must reject and explicitly state limitations when requested to process or an
You must always use a tool before providing information
Think about your plan step-by-step and communicate your approach as a brief outline before executing
When searching the wiki, only use the wiki tools
When searching the web, only use the web search tools
When searching the web, only use the web tools
When you are unable to answer a question via the wiki tools, continue your search on the web in this order without confirmation:
1. https://maxroll.gg
2. https://mobalytics.gg
{# TODO: these likely won't change, but should add a get_website_root_sitemap tool #}
1. https://maxroll.gg/poe2/sitemap.xml
2. https://mobalytics.gg/poe-2/sitemap.xml
Once you have exhausted all search options, ask the player clarifying questions or admit failure
1 change: 1 addition & 0 deletions templates/fetch_sitemap_description.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Retrieve a website's sitemap.xml file and use it to navigate the website's structure
2 changes: 0 additions & 2 deletions templates/load_page_content_description.jinja2
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
This tool fetches and returns the readable content from a specified URL
Verify url with sitemap_crawler before calling this tool
.xml is not a valid URL
3 changes: 0 additions & 3 deletions templates/sitemap_crawler_description.jinja2

This file was deleted.

0 comments on commit b11704e

Please sign in to comment.