Update: optimized dependencies
- pandas was only used in store.py to load the web.json dataset; the dataset schema has changed and it is now loaded with the standard json module, so pandas was dropped
- requests was only used by the web_search.py tool; since ollama already depends on httpx, the web search tool now uses httpx as well
- spacy is only used for chunking; it can't be removed, but only basic spacy functionality is needed, so the requirement switched to spacy[lookups]
- rich was listed in requirements-api.txt even though it wasn't used in the backend, so it was removed
antoninoLorenzo committed Nov 30, 2024
1 parent 9f05141 commit 524166d
Showing 12 changed files with 121 additions and 443 deletions.
2 changes: 1 addition & 1 deletion datasets/web/web.json

Large diffs are not rendered by default.
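Since the web.json diff is not rendered, the shape the new loader expects can be inferred from the store.py changes below: a top-level JSON array of records with "category", "title", and "content" keys. A minimal sketch of one assumed entry (field values are illustrative, not taken from the actual dataset):

[
  {
    "category": "Web",
    "title": "Example article title",
    "content": "Example article body, turned into a Document by get_available_datasets."
  }
]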

21 changes: 8 additions & 13 deletions requirements-api.txt
@@ -1,17 +1,12 @@
fastapi~=0.111.0
ollama~=0.3.1
qdrant-client~=1.9.0
spacy~=3.7.5
uvicorn
python-dotenv~=1.0.1
requests~=2.32.3
rich~=13.7.1
numpy~=1.26.4
fastapi[standard]
ollama
qdrant-client
python-dotenv
pydantic_settings
httpx
tool-parse
docker
pandas~=2.2.2
tqdm~=4.66.4
newspaper3k
psutil
lxml_html_clean
spacy[lookups]
psutil
uvicorn
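
A quick smoke test (a sketch, not part of the commit) to confirm the trimmed requirements still cover every runtime import after dropping pandas, requests, rich, numpy, and tqdm:

import importlib

# Each distribution's import name, e.g. python-dotenv -> dotenv,
# newspaper3k -> newspaper, tool-parse -> tool_parse.
for mod in ("fastapi", "ollama", "qdrant_client", "dotenv",
            "pydantic_settings", "httpx", "tool_parse", "docker",
            "newspaper", "lxml_html_clean", "spacy", "psutil", "uvicorn"):
    importlib.import_module(mod)
print("all API dependencies import cleanly")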
7 changes: 6 additions & 1 deletion requirements-dev.txt
@@ -2,6 +2,11 @@ isort
pylint
google-generativeai
pandas~=2.2.2
numpy
matplotlib~=3.9.0
seaborn~=0.13.2
tqdm~=4.66.4
tqdm~=4.66.4
prometheus_client
deepeval
playwright
lxml_html_clean
1 change: 0 additions & 1 deletion src/agent/__init__.py
@@ -3,7 +3,6 @@
LLM,
AVAILABLE_PROVIDERS,
TOOL_REGISTRY,
Memory
)
from src.agent.agent import Agent, AgentArchitecture
from src.agent.architectures import init_default_architecture
2 changes: 0 additions & 2 deletions src/core/__init__.py
@@ -6,8 +6,6 @@
Memory
)
from src.core.tools import (
ExploitDB,
Terminal,
Search,
TOOL_REGISTRY
)
65 changes: 34 additions & 31 deletions src/core/knowledge/store.py
@@ -6,11 +6,9 @@
import httpx
import ollama
import spacy
import pandas as pd
import qdrant_client.http.exceptions
from qdrant_client import QdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse
from tqdm import tqdm

from src.core.llm.llm import ProviderError
from src.core.knowledge.collections import Collection, Document, Topic
@@ -23,13 +21,14 @@ class Store:
"""Act as interface for Qdrant database.
Manages Collections and implements the Upload/Retrieve operations."""

def __init__(self,
base_path: str,
embedding_url: str = 'http://localhost:11434',
embedding_model: str = 'nomic-embed-text',
url: str = 'http://localhost:6333',
in_memory: bool = False,
):
def __init__(
self,
base_path: str,
embedding_url: str = 'http://localhost:11434',
embedding_model: str = 'nomic-embed-text',
url: str = 'http://localhost:6333',
in_memory: bool = False,
):
"""
:param embedding_url:
The url of the Ollama server.
@@ -83,8 +82,11 @@ def __init__(self,
except (httpx.ConnectError, ollama._types.ResponseError) as err:
raise ProviderError("Can't load embedding model") from err

def create_collection(self, collection: Collection,
progress_bar: bool = False):
def create_collection(
self,
collection: Collection,
progress_bar: bool = False
):
"""Creates a new Qdrant collection, uploads the collection documents
using `upload` and creates a metadata file for collection."""
if collection.title in self.collections:
@@ -103,16 +105,8 @@ def create_collection(self, collection: Collection,

# upload documents (if present)
self._collections[collection.title] = collection
if progress_bar:
for document in tqdm(
collection.documents,
total=len(collection.documents),
desc=f"Uploading {collection.title}"
):
self.upload(document, collection.title)
else:
for document in collection.documents:
self.upload(document, collection.title)
for document in collection.documents:
self.upload(document, collection.title)

# should do logging
# print(f'Collection {collection.title}: '
Expand All @@ -122,7 +116,11 @@ def create_collection(self, collection: Collection,
if not self.in_memory:
self.save_metadata(collection)

def upload(self, document: Document, collection_name: str):
def upload(
self,
document: Document,
collection_name: str
):
"""Performs chunking and embedding of a document
and uploads it to the specified collection"""
if not isinstance(collection_name, str):
@@ -162,9 +160,13 @@ def upload(self, document: Document, collection_name: str):
# self._collections[collection_name].documents.append(document)
self._collections[collection_name].size = current_len + len(emb_chunks)

def retrieve_from(self, query: str, collection_name: str,
limit: int = 3,
threshold: int = 0.5) -> list[str] | None:
def retrieve_from(
self,
query: str,
collection_name: str,
limit: int = 3,
threshold: int = 0.5
) -> list[str] | None:
"""Performs retrieval of chunks from the vector database.
:param query:
A natural language query used to search in the vector database.
@@ -254,15 +256,16 @@ def get_available_datasets() -> list[Collection]:
p.unlink()
continue

df = pd.read_json(p)
with open(p, 'r', encoding='utf-8') as fp:
data = json.load(fp)

topics = []
documents = []

for _, row in df.iterrows():
topic = Topic(row['category'])
for item in data:
topic = Topic(item['category'])
document = Document(
name=row['title'],
content=row['content'],
name=item['title'],
content=item['content'],
topic=topic
)
topics.append(topic)
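Note that create_collection still accepts progress_bar even though the tqdm loop is gone, so the flag is currently a no-op. If progress feedback is still wanted without the tqdm dependency, a stdout-only sketch (an assumption, not part of the commit) could replace the upload loop inside create_collection:

# Dependency-free progress reporting for the upload loop.
total = len(collection.documents)
for i, document in enumerate(collection.documents, start=1):
    self.upload(document, collection.title)
    if progress_bar:
        print(f"Uploading {collection.title}: {i}/{total}", end='\r', flush=True)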
4 changes: 2 additions & 2 deletions src/core/tools/__init__.py
@@ -2,9 +2,9 @@

from tool_parse import ToolRegistry

from src.core.tools.exploit_db import ExploitDB
# from src.core.tools.exploit_db import ExploitDB
from src.core.tools.web_search import Search
from src.core.tools.terminal import Terminal
# from src.core.tools.terminal import Terminal

TOOL_REGISTRY = ToolRegistry()
SEARCH = Search()
37 changes: 18 additions & 19 deletions src/core/tools/web_search.py
@@ -7,10 +7,9 @@
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import httpx
import newspaper
from bs4 import BeautifulSoup
from tool_parse import ToolRegistry

from src.utils import get_logger

@@ -23,10 +22,10 @@ class Search:
usage: str = "Make an online search using a query string."

def __init__(
self,
headers: dict = None,
max_results: int = 3,
num_threads: int = 3
self,
headers: dict = None,
max_results: int = 3,
num_threads: int = 3
):
"""
:param headers: HTTP headers to use for requests. Defaults to a basic
@@ -72,7 +71,7 @@ def run(self, search_query: str) -> str:

if len(links) == 1:
title, content, _, _ = self.__parse(links[0])
results = [f"# {title}\n{content}"]
results = [f"# {title} ({links[0]})\n{content}"]
else:
with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
futures = [
@@ -81,17 +80,17 @@
]

for future in as_completed(futures):
title, content, _ = future.result()
title, content, _, link = future.result()
if title and content:
results.append(f"> Page: {title}\n{content}")
results.append(f"# {title} ({link})\n{content}")

return '\n\n'.join(results)

def __google_search(
self,
search_query,
results=3,
timeout: int = 3
self,
search_query,
results=3,
timeout: int = 3
) -> list:
"""
Conducts a Google search and retrieves links from the result page.
@@ -102,7 +101,7 @@ def __google_search(
:returns: a list of links."""
try:
response = requests.get(
response: httpx.Response = httpx.get(
url="https://www.google.com/search",
headers=self.headers,
params={
@@ -114,8 +113,9 @@
},
timeout=timeout,
)
response.raise_for_status()
except requests.HTTPError as req_err:
if 400 <= response.status_code < 600:
raise httpx.HTTPError(f'{response.status_code}: {response.reason_phrase}')
except httpx.HTTPError as req_err:
print(f'[!] Error: {req_err}')
return []

@@ -150,7 +150,7 @@ def __exclude(self, link: str):
def __parse(self, link: str) -> tuple:
"""Downloads a web page and parses it with `newspaper3k` library.
:returns: tuple(title: str, content: str, tags: list)"""
:returns: tuple(title: str, content: str, tags: list, link: str)"""
page = newspaper.Article(
link,
headers=self.headers,
@@ -161,7 +161,7 @@ def __parse(self, link: str) -> tuple:
page.parse()
except newspaper.ArticleException:
return '', '', [], link  # match the new 4-tuple return
return page.title, page.text, page.tags
return page.title, page.text, page.tags, link

@staticmethod
def __user_agent() -> str:
@@ -172,4 +172,3 @@ def __user_agent() -> str:
# yeah, fuck apple
)
return available[random.randint(0, len(available) - 1)]
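
The replacement swaps requests' raise_for_status() for a manual status-code check. httpx also provides response.raise_for_status(), which raises httpx.HTTPStatusError (a subclass of httpx.HTTPError) on 4xx/5xx responses, so an equivalent and terser guard would have been possible; a sketch under that assumption, with self.headers and timeout taken from the surrounding method:

try:
    response = httpx.get(
        url="https://www.google.com/search",
        headers=self.headers,
        timeout=timeout,
    )
    response.raise_for_status()  # httpx.HTTPStatusError on 4xx/5xx
except httpx.HTTPError as req_err:  # also covers transport errors
    print(f'[!] Error: {req_err}')
    return []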

File renamed without changes.
52 changes: 52 additions & 0 deletions test/architecture/datasets/default_architecture.json
@@ -0,0 +1,52 @@
[
{
"category": "Web",
"activity": "code_writing",
"input": "While experimenting with SQLi I found the following ways to bypass a client-side filter:\nOR: OORR\nAND: AANDND\nSELECT: SESELECTLECT\nUNION: UNUNIONION\nORDER BY: OORRDER BY\n\nWrite the tamper script for sqlmap to use this filter bypass techniques.",
"router": {
"default": 1,
"reasoning": 1,
"tool": 0
}
},
{
"category": "Web",
"activity": "tool_usage",
"input": "I am doing a CTF, i recorded this request with burp:\nPOST /adminLogin007.php HTTP/1.1\nHost: 10.10.232.63\nUser-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\nAccept-Language: en-US,en;q=0.5\nAccept-Encoding: gzip, deflate, br\nContent-Type: application/x-www-form-urlencoded\nContent-Length: 36\nOrigin: http://10.10.232.63\nConnection: keep-alive\nReferer: http://10.10.232.63/adminLogin007.php\nCookie: PHPSESSID=ooi4eckq3k748ed6ed2f6m0bhv\nUpgrade-Insecure-Requests: 1\n\nmail=dev%40injectics.thm&pass=123456\n\nI want to brute force the password, the succcessufll logi nshould redirect me to a admin dashboard, when wrong password is used I can see a div with class alert alert-danger and \"Invalid email or password.\" as content. Write the hydra command.",
"router": {
"default": 1,
"reasoning": 1,
"tool": 0
}
},
{
"category": "Web",
"activity": "general",
"input": "in a CTF, once I gained access to JetBrains TeamCity, I need to get remote access (ex. via RCE). Can you find a vulnerability?",
"router": {
"default": 0,
"reasoning": 0,
"tool": 1
}
},
{
"category": "OS",
"activity": "code_reading",
"input": "I can execute the following script as sudo, however I can't edit it. \n```\necho \"Enter your feedback:\"\nread feedback\n\n\nif [[ \"$feedback\" != *\"\\`\"* && \"$feedback\" != *\")\"* && \"$feedback\" != *\"\\$(\"* && \"$feedback\" != *\"|\"* && \"$feedback\" != *\"&\"* && \"$feedback\" != *\";\"* && \"$feedback\" != *\"?\"* && \"$feedback\" != *\"!\"* && \"$feedback\" != *\"\\\\\"* ]]; then\n echo \"It is This:\"\n eval \"echo $feedback\"\n\n echo \"$feedback\" >> /var/log/feedback.txt\n echo \"Feedback successfully saved.\"\nelse\n echo \"Invalid input. Please provide a valid input.\" \nfi\n```\n\nIs there any way I can exploit it?",
"router": {
"default": 0,
"reasoning": 1,
"tool": 0
}
},
{
"category": "OS",
"activity": "general",
"input": "with this information how can I do privilege escalation?\n\n**Permissions**\nUser jake may run the following commands on smag:\n (ALL : ALL) NOPASSWD: /usr/bin/apt-get\n \n\n**PATH**\n/home/jake/bin:/home/jake/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games\n- user home and home/bin is under path",
"router": {
"default": 0,
"reasoning": 1,
"tool": 1
}
}
]
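
Each entry in this new dataset pairs a prompt with the routes (default, reasoning, tool) the router is expected to select. A minimal sketch of how a test harness might consume it (the harness itself is assumed, not part of this commit):

import json

with open('test/architecture/datasets/default_architecture.json',
          encoding='utf-8') as fp:
    cases = json.load(fp)

for case in cases:
    expected = [route for route, flag in case['router'].items() if flag]
    print(f"{case['category']}/{case['activity']}: expected -> {expected}")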