Update: optimized dependencies
- pandas was only used in store.py to load the web.json dataset; the dataset schema has changed and it is now loaded with the standard json module, so pandas was dropped
- requests was only used by the web_search.py tool; since ollama already depends on httpx, the web search tool now uses httpx as well
- spacy is only used for chunking; it can't be removed, but only basic spacy functionality is needed, so the requirement switched to spacy[lookups]
- rich was listed in requirements-api.txt even though it wasn't used in the backend, so it was removed
antoninoLorenzo committed Nov 30, 2024
1 parent 9f05141 commit 524166d
Showing 12 changed files with 121 additions and 443 deletions.
2 changes: 1 addition & 1 deletion datasets/web/web.json

Large diffs are not rendered by default.
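Since the web.json diff is not rendered, the shape the new loader expects can be inferred from the store.py changes below: a top-level JSON array of records with "category", "title", and "content" keys. A minimal sketch of one assumed entry (field values are illustrative, not taken from the actual dataset):

[
  {
    "category": "Web",
    "title": "Example article title",
    "content": "Example article body, turned into a Document by get_available_datasets."
  }
]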

21 changes: 8 additions & 13 deletions requirements-api.txt
@@ -1,17 +1,12 @@
fastapi~=0.111.0
ollama~=0.3.1
qdrant-client~=1.9.0
spacy~=3.7.5
uvicorn
python-dotenv~=1.0.1
requests~=2.32.3
rich~=13.7.1
numpy~=1.26.4
fastapi[standard]
ollama
qdrant-client
python-dotenv
pydantic_settings
httpx
tool-parse
docker
pandas~=2.2.2
tqdm~=4.66.4
newspaper3k
psutil
lxml_html_clean
spacy[lookups]
psutil
uvicorn
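
A quick smoke test (a sketch, not part of the commit) to confirm the trimmed requirements still cover every runtime import after dropping pandas, requests, rich, numpy, and tqdm:

import importlib

# Each distribution's import name, e.g. python-dotenv -> dotenv,
# newspaper3k -> newspaper, tool-parse -> tool_parse.
for mod in ("fastapi", "ollama", "qdrant_client", "dotenv",
            "pydantic_settings", "httpx", "tool_parse", "docker",
            "newspaper", "lxml_html_clean", "spacy", "psutil", "uvicorn"):
    importlib.import_module(mod)
print("all API dependencies import cleanly")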
7 changes: 6 additions & 1 deletion requirements-dev.txt
@@ -2,6 +2,11 @@ isort
pylint
google-generativeai
pandas~=2.2.2
numpy
matplotlib~=3.9.0
seaborn~=0.13.2
tqdm~=4.66.4
tqdm~=4.66.4
prometheus_client
deepeval
playwright
lxml_html_clean
1 change: 0 additions & 1 deletion src/agent/__init__.py
@@ -3,7 +3,6 @@
LLM,
AVAILABLE_PROVIDERS,
TOOL_REGISTRY,
Memory
)
from src.agent.agent import Agent, AgentArchitecture
from src.agent.architectures import init_default_architecture
2 changes: 0 additions & 2 deletions src/core/__init__.py
@@ -6,8 +6,6 @@
Memory
)
from src.core.tools import (
ExploitDB,
Terminal,
Search,
TOOL_REGISTRY
)
65 changes: 34 additions & 31 deletions src/core/knowledge/store.py
@@ -6,11 +6,9 @@
import httpx
import ollama
import spacy
import pandas as pd
import qdrant_client.http.exceptions
from qdrant_client import QdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse
from tqdm import tqdm

from src.core.llm.llm import ProviderError
from src.core.knowledge.collections import Collection, Document, Topic
@@ -23,13 +21,14 @@ class Store:
"""Act as interface for Qdrant database.
Manages Collections and implements the Upload/Retrieve operations."""

def __init__(self,
base_path: str,
embedding_url: str = 'http://localhost:11434',
embedding_model: str = 'nomic-embed-text',
url: str = 'http://localhost:6333',
in_memory: bool = False,
):
def __init__(
self,
base_path: str,
embedding_url: str = 'http://localhost:11434',
embedding_model: str = 'nomic-embed-text',
url: str = 'http://localhost:6333',
in_memory: bool = False,
):
"""
:param embedding_url:
The url of the Ollama server.
@@ -83,8 +82,11 @@ def __init__(self,
except (httpx.ConnectError, ollama._types.ResponseError) as err:
raise ProviderError("Can't load embedding model") from err

def create_collection(self, collection: Collection,
progress_bar: bool = False):
def create_collection(
self,
collection: Collection,
progress_bar: bool = False
):
"""Creates a new Qdrant collection, uploads the collection documents
using `upload` and creates a metadata file for collection."""
if collection.title in self.collections:
@@ -103,16 +105,8 @@ def create_collection(self, collection: Collection,

# upload documents (if present)
self._collections[collection.title] = collection
if progress_bar:
for document in tqdm(
collection.documents,
total=len(collection.documents),
desc=f"Uploading {collection.title}"
):
self.upload(document, collection.title)
else:
for document in collection.documents:
self.upload(document, collection.title)
for document in collection.documents:
self.upload(document, collection.title)

# should do logging
# print(f'Collection {collection.title}: '
Expand All @@ -122,7 +116,11 @@ def create_collection(self, collection: Collection,
if not self.in_memory:
self.save_metadata(collection)

def upload(self, document: Document, collection_name: str):
def upload(
self,
document: Document,
collection_name: str
):
"""Performs chunking and embedding of a document
and uploads it to the specified collection"""
if not isinstance(collection_name, str):
@@ -162,9 +160,13 @@ def upload(self, document: Document, collection_name: str):
# self._collections[collection_name].documents.append(document)
self._collections[collection_name].size = current_len + len(emb_chunks)

def retrieve_from(self, query: str, collection_name: str,
limit: int = 3,
threshold: int = 0.5) -> list[str] | None:
def retrieve_from(
self,
query: str,
collection_name: str,
limit: int = 3,
threshold: int = 0.5
) -> list[str] | None:
"""Performs retrieval of chunks from the vector database.
:param query:
A natural language query used to search in the vector database.
@@ -254,15 +256,16 @@ def get_available_datasets() -> list[Collection]:
p.unlink()
continue

df = pd.read_json(p)
with open(p, 'r', encoding='utf-8') as fp:
data = json.load(fp)

topics = []
documents = []

for _, row in df.iterrows():
topic = Topic(row['category'])
for item in data:
topic = Topic(item['category'])
document = Document(
name=row['title'],
content=row['content'],
name=item['title'],
content=item['content'],
topic=topic
)
topics.append(topic)
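Note that create_collection still accepts progress_bar even though the tqdm loop is gone, so the flag is currently a no-op. If progress feedback is still wanted without the tqdm dependency, a stdout-only sketch (an assumption, not part of the commit) could replace the upload loop inside create_collection:

# Dependency-free progress reporting for the upload loop.
total = len(collection.documents)
for i, document in enumerate(collection.documents, start=1):
    self.upload(document, collection.title)
    if progress_bar:
        print(f"Uploading {collection.title}: {i}/{total}", end='\r', flush=True)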
4 changes: 2 additions & 2 deletions src/core/tools/__init__.py
@@ -2,9 +2,9 @@

from tool_parse import ToolRegistry

from src.core.tools.exploit_db import ExploitDB
# from src.core.tools.exploit_db import ExploitDB
from src.core.tools.web_search import Search
from src.core.tools.terminal import Terminal
# from src.core.tools.terminal import Terminal

TOOL_REGISTRY = ToolRegistry()
SEARCH = Search()
37 changes: 18 additions & 19 deletions src/core/tools/web_search.py
@@ -7,10 +7,9 @@
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import httpx
import newspaper
from bs4 import BeautifulSoup
from tool_parse import ToolRegistry

from src.utils import get_logger

@@ -23,10 +22,10 @@ class Search:
usage: str = "Make an online search using a query string."

def __init__(
self,
headers: dict = None,
max_results: int = 3,
num_threads: int = 3
self,
headers: dict = None,
max_results: int = 3,
num_threads: int = 3
):
"""
:param headers: HTTP headers to use for requests. Defaults to a basic
@@ -72,7 +71,7 @@ def run(self, search_query: str) -> str:

if len(links) == 1:
title, content, _, _ = self.__parse(links[0])
results = [f"# {title}\n{content}"]
results = [f"# {title} ({links[0]})\n{content}"]
else:
with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
futures = [
@@ -81,17 +80,17 @@
]

for future in as_completed(futures):
title, content, _ = future.result()
title, content, _, link = future.result()
if title and content:
results.append(f"> Page: {title}\n{content}")
results.append(f"# {title} ({link})\n{content}")

return '\n\n'.join(results)

def __google_search(
self,
search_query,
results=3,
timeout: int = 3
self,
search_query,
results=3,
timeout: int = 3
) -> list:
"""
Conducts a Google search and retrieves links from the result page.
@@ -102,7 +101,7 @@ def __google_search(
:returns: a list of links."""
try:
response = requests.get(
response: httpx.Response = httpx.get(
url="https://www.google.com/search",
headers=self.headers,
params={
@@ -114,8 +113,9 @@
},
timeout=timeout,
)
response.raise_for_status()
except requests.HTTPError as req_err:
if 400 <= response.status_code < 600:
raise httpx.HTTPError(f'{response.status_code}: {response.reason_phrase}')
except httpx.HTTPError as req_err:
print(f'[!] Error: {req_err}')
return []

@@ -150,7 +150,7 @@ def __exclude(self, link: str):
def __parse(self, link: str) -> tuple:
"""Downloads a web page and parses it with `newspaper3k` library.
:returns: tuple(title: str, content: str, tags: list)"""
:returns: tuple(title: str, content: str, tags: list, link: str)"""
page = newspaper.Article(
link,
headers=self.headers,
@@ -161,7 +161,7 @@ def __parse(self, link: str) -> tuple:
page.parse()
except newspaper.ArticleException:
return '', '', [], link  # match the new 4-tuple return
return page.title, page.text, page.tags
return page.title, page.text, page.tags, link

@staticmethod
def __user_agent() -> str:
@@ -172,4 +172,3 @@ def __user_agent() -> str:
# yeah, fuck apple
)
return available[random.randint(0, len(available) - 1)]
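
The replacement swaps requests' raise_for_status() for a manual status-code check. httpx also provides response.raise_for_status(), which raises httpx.HTTPStatusError (a subclass of httpx.HTTPError) on 4xx/5xx responses, so an equivalent and terser guard would have been possible; a sketch under that assumption, with self.headers and timeout taken from the surrounding method:

try:
    response = httpx.get(
        url="https://www.google.com/search",
        headers=self.headers,
        timeout=timeout,
    )
    response.raise_for_status()  # httpx.HTTPStatusError on 4xx/5xx
except httpx.HTTPError as req_err:  # also covers transport errors
    print(f'[!] Error: {req_err}')
    return []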

File renamed without changes.
52 changes: 52 additions & 0 deletions test/architecture/datasets/default_architecture.json
@@ -0,0 +1,52 @@
[
{
"category": "Web",
"activity": "code_writing",
"input": "While experimenting with SQLi I found the following ways to bypass a client-side filter:\nOR: OORR\nAND: AANDND\nSELECT: SESELECTLECT\nUNION: UNUNIONION\nORDER BY: OORRDER BY\n\nWrite the tamper script for sqlmap to use this filter bypass techniques.",
"router": {
"default": 1,
"reasoning": 1,
"tool": 0
}
},
{
"category": "Web",
"activity": "tool_usage",
"input": "I am doing a CTF, i recorded this request with burp:\nPOST /adminLogin007.php HTTP/1.1\nHost: 10.10.232.63\nUser-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\nAccept-Language: en-US,en;q=0.5\nAccept-Encoding: gzip, deflate, br\nContent-Type: application/x-www-form-urlencoded\nContent-Length: 36\nOrigin: http://10.10.232.63\nConnection: keep-alive\nReferer: http://10.10.232.63/adminLogin007.php\nCookie: PHPSESSID=ooi4eckq3k748ed6ed2f6m0bhv\nUpgrade-Insecure-Requests: 1\n\nmail=dev%40injectics.thm&pass=123456\n\nI want to brute force the password, the succcessufll logi nshould redirect me to a admin dashboard, when wrong password is used I can see a div with class alert alert-danger and \"Invalid email or password.\" as content. Write the hydra command.",
"router": {
"default": 1,
"reasoning": 1,
"tool": 0
}
},
{
"category": "Web",
"activity": "general",
"input": "in a CTF, once I gained access to JetBrains TeamCity, I need to get remote access (ex. via RCE). Can you find a vulnerability?",
"router": {
"default": 0,
"reasoning": 0,
"tool": 1
}
},
{
"category": "OS",
"activity": "code_reading",
"input": "I can execute the following script as sudo, however I can't edit it. \n```\necho \"Enter your feedback:\"\nread feedback\n\n\nif [[ \"$feedback\" != *\"\\`\"* && \"$feedback\" != *\")\"* && \"$feedback\" != *\"\\$(\"* && \"$feedback\" != *\"|\"* && \"$feedback\" != *\"&\"* && \"$feedback\" != *\";\"* && \"$feedback\" != *\"?\"* && \"$feedback\" != *\"!\"* && \"$feedback\" != *\"\\\\\"* ]]; then\n echo \"It is This:\"\n eval \"echo $feedback\"\n\n echo \"$feedback\" >> /var/log/feedback.txt\n echo \"Feedback successfully saved.\"\nelse\n echo \"Invalid input. Please provide a valid input.\" \nfi\n```\n\nIs there any way I can exploit it?",
"router": {
"default": 0,
"reasoning": 1,
"tool": 0
}
},
{
"category": "OS",
"activity": "general",
"input": "with this information how can I do privilege escalation?\n\n**Permissions**\nUser jake may run the following commands on smag:\n (ALL : ALL) NOPASSWD: /usr/bin/apt-get\n \n\n**PATH**\n/home/jake/bin:/home/jake/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games\n- user home and home/bin is under path",
"router": {
"default": 0,
"reasoning": 1,
"tool": 1
}
}
]
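
Each entry in this new dataset pairs a prompt with the routes (default, reasoning, tool) the router is expected to select. A minimal sketch of how a test harness might consume it (the harness itself is assumed, not part of this commit):

import json

with open('test/architecture/datasets/default_architecture.json',
          encoding='utf-8') as fp:
    cases = json.load(fp)

for case in cases:
    expected = [route for route, flag in case['router'].items() if flag]
    print(f"{case['category']}/{case['activity']}: expected -> {expected}")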