0.0.5 Release: Dispute System #5

Merged 2 commits on Aug 6, 2024
2 changes: 2 additions & 0 deletions .eslintignore
@@ -1,3 +1,5 @@
+dispute/
+
 # IDE
 .idea/
 .vscode/
1 change: 0 additions & 1 deletion .github/workflows/pre-merge-checks.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches: ["main"]
   pull_request:
-    branches: [ main, master ]

 jobs:
   checkout-install:
3 changes: 3 additions & 0 deletions .gitignore
@@ -19,3 +19,6 @@ Thumbs.db

 # Logs
 *.log
+
+logs/
+__pycache__/
2 changes: 2 additions & 0 deletions .prettierignore
@@ -1,3 +1,5 @@
+dispute/
+
 # IDE
 .idea/
 .vscode/
7 changes: 7 additions & 0 deletions dispute/.env.sample
@@ -0,0 +1,7 @@
LLM_BASE_URL=http://host.docker.internal:11434/v1
LLM_API_KEY=ollama
MODELS=llama3.1:8b,qwen:4b,mistral:7b
SIMULATION_MODEL=llama3.1:8b
ORCHESTRATOR_URL=https://orchestrator.chasm.net
WEBHOOK_API_KEY=
MIN_CONFIDENCE_SCORE=0.5
3 changes: 3 additions & 0 deletions dispute/.gitignore
@@ -0,0 +1,3 @@
venv
__pycache__
processed_histories.json
38 changes: 38 additions & 0 deletions dispute/Dockerfile
@@ -0,0 +1,38 @@
# Use the official Python image from the Docker Hub
FROM python:3.10-slim-bullseye

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    libopenblas-dev \
    python3-dev \
    cmake \
    git \
    curl \
    && apt-get clean

WORKDIR /usr/src/app

# Copy and install dependencies
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Make the API key available at build and run time
ARG LLM_API_KEY
ENV LLM_API_KEY=${LLM_API_KEY}

# Debug output: show the key the build is using
RUN echo "----------"
RUN echo $LLM_API_KEY

# When targeting a local Ollama backend, verify it is reachable at build time
RUN if [ "$LLM_API_KEY" = "ollama" ]; then python3 verify_ollama.py; fi

# Specify the command to run on container start
CMD ["python", "main.py"]
31 changes: 31 additions & 0 deletions dispute/README.md
@@ -0,0 +1,31 @@
## Ollama

```
LLM_BASE_URL=http://host.docker.internal:11434/v1
LLM_API_KEY=ollama
MODELS=llama3.1:8b,qwen:4b,mistral:7b
SIMULATION_MODEL=llama3.1:8b
ORCHESTRATOR_URL=https://orchestrator.chasm.net
WEBHOOK_API_KEY=WlRFN20pc1IpMmlwZ2hBXk9ec1JWNkpzSzJjI2EqNHc
MIN_CONFIDENCE_SCORE=0.5
```

## OpenRouter

https://openrouter.ai/docs#models

```
LLM_API_KEY=
LLM_BASE_URL=https://openrouter.ai/api/v1
MODELS=google/gemma-7b-it,meta-llama/llama-3-8b-instruct,microsoft/wizardlm-2-7b,mistralai/mistral-7b-instruct-v0.3
SIMULATION_MODEL=meta-llama/llama-3-8b-instruct
```

## Groq

```
LLM_API_KEY=
LLM_BASE_URL=https://api.groq.com/openai/v1
MODELS=llama3-8b-8192,mixtral-8x7b-32768,gemma-7b-it
SIMULATION_MODEL=llama3-8b-8192
```
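All three configurations point at an OpenAI-compatible endpoint, which is why only `LLM_BASE_URL`, `LLM_API_KEY`, and the model names differ between providers. A minimal sketch of how such settings can be wired into a client (the project's actual client lives in `util/llm`, which is outside this diff, so the code below is illustrative only):

```python
import os
from openai import OpenAI  # openai>=1.x, as pinned in dispute/requirements.txt

# Illustrative only: mirrors how config.py reads the same variables.
client = OpenAI(
    base_url=os.getenv("LLM_BASE_URL"),
    api_key=os.getenv("LLM_API_KEY"),
)

models = os.getenv("MODELS", "").split(",")

# Any entry from MODELS can be queried through the same client.
response = client.chat.completions.create(
    model=models[0],
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)
```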
Empty file added dispute/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions dispute/benchmark.py
@@ -0,0 +1,27 @@
import dotenv

dotenv.load_dotenv()

import asyncio
from util.chasm import ChasmConnection
from strategy import analyze_text


async def main():
    connection = ChasmConnection()
    histories = connection.get_benchmark_test()
    for i, history in enumerate(histories):
        print(f"--- {i} ---")
        result = await analyze_text(history["input"], history["output"])
        print(f"Result: {result}")
        print(f"Score: {result['confidence_score']}")
        print(f"Assert Check: {'✅' if result['correct'] == history['answer'] else '❌'}")
        print(f"Dispute: {result['dispute']}")


if __name__ == "__main__":
    # set TOKENIZERS_PARALLELISM
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    asyncio.run(main())
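benchmark.py above and main.py later in this PR both treat the value returned by `strategy.analyze_text` as a mapping with at least `confidence_score`, `correct`, and `dispute` keys. `strategy.py` itself is not part of this diff, so the shape below is inferred purely from how the result is used, not from its actual definition:

```python
from typing import TypedDict

class AnalysisResult(TypedDict):
    # Inferred from how benchmark.py and main.py consume the result;
    # the real strategy.analyze_text may return more, or differently typed, fields.
    confidence_score: float  # compared against MIN_CONFIDENCE_SCORE in main.py
    correct: object          # compared with history["answer"] in benchmark.py; type not shown here
    dispute: object          # printed by benchmark.py; type not shown here
```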
20 changes: 20 additions & 0 deletions dispute/config.py
@@ -0,0 +1,20 @@
import os
from dotenv import load_dotenv

load_dotenv()

LLM_BASE_URL = os.getenv("LLM_BASE_URL")
LLM_API_KEY = os.getenv("LLM_API_KEY")
MODELS = os.getenv("MODELS", "gemma2-9b-it").split(",")
SIMULATION_MODEL = os.getenv("SIMULATION_MODEL", "gemma2-9b-it")
ORCHESTRATOR_URL = os.getenv("ORCHESTRATOR_URL")
WEBHOOK_API_KEY = os.getenv("WEBHOOK_API_KEY")
MIN_CONFIDENCE_SCORE = float(os.getenv("MIN_CONFIDENCE_SCORE", 0.5))
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

print(f"LOG_LEVEL: {LOG_LEVEL}")
print(f"MIN_CONFIDENCE_SCORE: {MIN_CONFIDENCE_SCORE}")
print(f"LLM_BASE_URL: {LLM_BASE_URL}")
print(f"MODELS: {MODELS}")
print(f"SIMULATION_MODEL: {SIMULATION_MODEL}")
print(f"ORCHESTRATOR_URL: {ORCHESTRATOR_URL}")
19 changes: 19 additions & 0 deletions dispute/docker-compose.yml
@@ -0,0 +1,19 @@
version: '3.8'

services:
  app:
    network_mode: "host"
    build:
      context: .
      args:
        LLM_API_KEY: ${LLM_API_KEY}
    env_file:
      - .env
    ports:
      - "11434:11434"
    environment:
      - LLM_API_KEY=${LLM_API_KEY}
      - LLM_BASE_URL=${LLM_BASE_URL}
      - MODELS=${MODELS}
      - SIMULATION_MODEL=${SIMULATION_MODEL}
    command: python3 main.py
66 changes: 66 additions & 0 deletions dispute/main.py
@@ -0,0 +1,66 @@
from config import LOG_LEVEL, MIN_CONFIDENCE_SCORE
import asyncio
import os
from strategy import analyze_text
from util.chasm import ChasmConnection
import json
import logging

logging.basicConfig(
    level=LOG_LEVEL,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

chasm = ChasmConnection()
PROCESSED_HISTORIES_FILE = "processed_histories.json"


def load_processed_histories():
    if os.path.exists(PROCESSED_HISTORIES_FILE):
        with open(PROCESSED_HISTORIES_FILE, "r") as f:
            return set(json.load(f))
    return set()


def save_processed_histories(histories):
    with open(PROCESSED_HISTORIES_FILE, "w") as f:
        json.dump(list(histories), f)


processed_histories = load_processed_histories()


async def process_histories():
    histories = chasm.get_prompt_history()
    logging.info(f"Histories: {len(histories)}")
    for history in histories:
        if history["_id"] in processed_histories:
            logging.debug(f"Skipping already processed history: {history['_id']}")
            continue
        input = "\n".join(m["content"] for m in history["messages"])
        output = history["result"]["choices"][0]["message"]["content"]
        result = await analyze_text(input, output)
        logging.debug(f"Result: {result}")
        logging.debug(f"Score: {result['confidence_score']}")

        if result["confidence_score"] > MIN_CONFIDENCE_SCORE:
            response = chasm.file_dispute(
                history["_id"],
                history["messages"],
                history["result"]["choices"][0]["message"],
            )
            logging.info(f"Dispute filed: {response}")

        # Cache history so it is not re-analyzed on the next poll
        processed_histories.add(history["_id"])
        save_processed_histories(processed_histories)


async def main():
    while True:
        await process_histories()
        await asyncio.sleep(1)


if __name__ == "__main__":
    # set TOKENIZERS_PARALLELISM
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    asyncio.run(main())
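The prompt-history records returned by `ChasmConnection.get_prompt_history` are not defined in this diff; the loop above only relies on the fields sketched below. A hypothetical record, inferred from those accesses:

```python
# Hypothetical example record, inferred from the fields main.py reads.
history = {
    "_id": "example-id",                # used for de-duplication across polls
    "messages": [                       # joined into the analysis input
        {"role": "user", "content": "What element is the main constituent of anthracite?"},
    ],
    "result": {                         # the provider response under review
        "choices": [
            {"message": {"role": "assistant", "content": "Mostly carbon."}}
        ]
    },
}
```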
6 changes: 6 additions & 0 deletions dispute/requirements.txt
@@ -0,0 +1,6 @@
openai==1.30.1
tiktoken==0.7.0
blobfile==2.1.1
scikit-learn==1.5.0
asyncio==3.4.3
python-dotenv==1.0.1
76 changes: 76 additions & 0 deletions dispute/strategies/LLMQuality.py
@@ -0,0 +1,76 @@
import logging
from util.poll import PollAlgo
from strategies.tokenizer import Tokenizer


class LLMQualityStrategy:
    def __init__(self, models):
        # llama3 tokenizer
        self.tokenizer = Tokenizer()
        self.poll = PollAlgo(models)

    def format_text_limit(self, text: str, limit: int):
        encoded_result = self.tokenizer.encode(text, bos=False, eos=False)
        return self.tokenizer.decode(encoded_result[:limit])

    async def analyze(self, input: str, output: str):
        input = self.format_text_limit(input, 3000)
        output = self.format_text_limit(output, 3000)

        # Prompt from https://arxiv.org/abs/2306.05685v4
        results = await self.poll.create_poll_completion(
            messages=[
                {
                    "role": "system",
                    "content": """You are judging whether a model has generated a correct answer to a question.
Study the examples the user gives you as they will be very informative for how to do the task.
Don’t worry about factuality with respect to the real world, just judge the example based on what you see.
No need to overthink this task, it really comes down to just soft matching.
If the Provided Answer is correct say exactly "True", otherwise say "False".
Only answer the question, do not provide supporting exposition.
""",
                },
                {
                    "role": "user",
                    "content": """Question: What element is the main constituent of anthracite?
Provided Answer: The main constituent of anthracite is carbon. High-grade anthracite, for example, has a chemical formula represented by CHONS, indicating it contains carbon, hydrogen, oxygen, nitrogen, and sulfur, with carbon making up 94% of its composition. Ultra high-grade (UHG) anthracite typically has a minimum carbon content of 95%, further emphasizing carbon as the primary element in anthracite.
Answer: Carbon
""",
                },
                {"role": "assistant", "content": "True"},
                {
                    "role": "user",
                    "content": """Question: The religious order of Poor Ladies of San Damiano has what more common name?
Provided Answer: The religious order of Poor Ladies of San Damiano is more commonly known as the Poor Clares. This order was founded by Saints Clare of Assisi and Francis of Assisi in 1212, and it was the second Franciscan Order to be established. The Poor Clares were initially referred to as the Order of Poor Ladies, and later the Clarisses, the Minoresses, the Franciscan Clarist Order, and the Second Order of Saint Francis. However, the name Poor Clares is the most recognized and widely used today.
Answer: Franciscan Clarist Order
""",
                },
                {"role": "assistant", "content": "False"},
                {"role": "user", "content": f"Question: {input}\nAnswer: {output}"},
            ],
        )
        logging.debug(results)
        llm_results = []

        for r in results:
            assert r is not None, "LLM returned None"
            r_lower = r.lower()
            if "true" in r_lower and "false" in r_lower:
                # Both tokens present: whichever appears first wins
                true_index = r_lower.find("true")
                false_index = r_lower.find("false")
                llm_results.append(0 if true_index < false_index else 1)
            elif "true" in r_lower:
                llm_results.append(0)
            elif "false" in r_lower:
                llm_results.append(1)
            elif "yes" in r_lower:
                llm_results.append(0)
            elif "no" in r_lower:
                llm_results.append(1)
            else:
                llm_results.append(0.5)
        logging.debug(f"LLM Results: {llm_results}")
        score = sum(llm_results) / len(llm_results)
        return score
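Each polled model answers independently; a "True"/"yes" vote is mapped to 0 (output judged correct), a "False"/"no" vote to 1, and anything unparseable to 0.5, so the returned score is essentially the fraction of judges that consider the output wrong. A self-contained illustration of that averaging (the parsing here is deliberately simplified relative to `analyze` above):

```python
def vote_to_score(reply: str) -> float:
    # Simplified version of the mapping in LLMQualityStrategy.analyze:
    # "True"/"yes" -> 0 (judged correct), "False"/"no" -> 1, unclear -> 0.5
    reply = reply.lower()
    if "true" in reply or "yes" in reply:
        return 0.0
    if "false" in reply or "no" in reply:
        return 1.0
    return 0.5

replies = ["True", "false", "True."]
score = sum(vote_to_score(r) for r in replies) / len(replies)
print(score)  # 0.333... -> one of three judges thinks the output is wrong
```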
35 changes: 35 additions & 0 deletions dispute/strategies/ResponseSimilarity.py
@@ -0,0 +1,35 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from util.llm import llm


class ResponseSimilarityAnalysis:
    def __init__(self, model: str):
        self.model = model

    async def analyze(self, input: str, output: str):
        loop = asyncio.get_event_loop()
        score = await loop.run_in_executor(None, self._sync_analyse, input, output)
        return score

    def _sync_analyse(self, input: str, output: str):
        simulated_output = llm.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": input},
            ],
            model=self.model,
            temperature=0.8,
        )
        simulated_output_response = simulated_output.choices[0].message.content
        assert simulated_output_response is not None, "Simulated output is None"
        score = self.similarity(simulated_output_response, output)
        return score

    def similarity(self, text1: str, text2: str) -> float:
        vectorizer = TfidfVectorizer().fit_transform([text1, text2])
        vectors = vectorizer.toarray()
        cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
        score = cosine_sim[0][0]
        return 1 - score
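The `similarity` helper returns a TF-IDF cosine dissimilarity: roughly 0 when the simulated answer and the reviewed output share most of their wording, and values approaching 1 when they share little. A standalone check of that behaviour using the same scikit-learn calls:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def dissimilarity(text1: str, text2: str) -> float:
    # Same computation as ResponseSimilarityAnalysis.similarity above.
    vectors = TfidfVectorizer().fit_transform([text1, text2]).toarray()
    return 1 - cosine_similarity(vectors[0:1], vectors[1:])[0][0]

print(dissimilarity("the capital of France is Paris",
                    "the capital of France is Paris"))   # ~0.0, identical wording
print(dissimilarity("the capital of France is Paris",
                    "anthracite is mostly carbon"))      # close to 1.0, little overlap
```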
19 changes: 19 additions & 0 deletions dispute/strategies/SemanticSimilarity.py
@@ -0,0 +1,19 @@
import asyncio
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class SemanticSimilarityAnalysis:
    def __init__(self):
        pass

    async def analyze(self, input: str, output: str) -> float:
        loop = asyncio.get_event_loop()
        score = await loop.run_in_executor(None, self._sync_analyze, input, output)
        return score

    def _sync_analyze(self, input: str, output: str) -> float:
        vectorizer = TfidfVectorizer().fit_transform([input, output])
        vectors = vectorizer.toarray()
        cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
        score = cosine_sim[0][0]
        return 1 - score