Commit

add dispute system (#1)

cloudre01 authored and superoo7 committed Aug 6, 2024
1 parent d0848b5 commit 912bd49
Showing 28 changed files with 128,968 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .eslintignore
@@ -1,3 +1,5 @@
dispute/

# IDE
.idea/
.vscode/
1 change: 0 additions & 1 deletion .github/workflows/pre-merge-checks.yml
@@ -4,7 +4,6 @@ on:
push:
branches: ["main"]
pull_request:
branches: [ main, master ]

jobs:
checkout-install:
3 changes: 3 additions & 0 deletions .gitignore
@@ -19,3 +19,6 @@ Thumbs.db

# Logs
*.log

logs/
__pycache__/
2 changes: 2 additions & 0 deletions .prettierignore
@@ -1,3 +1,5 @@
dispute/

# IDE
.idea/
.vscode/
7 changes: 7 additions & 0 deletions dispute/.env.sample
@@ -0,0 +1,7 @@
LLM_BASE_URL=http://host.docker.internal:11434/v1
LLM_API_KEY=ollama
MODELS=llama3.1:8b,qwen:4b,mistral:7b
SIMULATION_MODEL=llama3.1:8b
ORCHESTRATOR_URL=https://orchestrator.chasm.net
WEBHOOK_API_KEY=
MIN_CONFIDENCE_SCORE=0.5
3 changes: 3 additions & 0 deletions dispute/.gitignore
@@ -0,0 +1,3 @@
venv
__pycache__
processed_histories.json
38 changes: 38 additions & 0 deletions dispute/Dockerfile
@@ -0,0 +1,38 @@
# Use the official Python image from the Docker Hub
FROM python:3.10-slim-bullseye

# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
gcc \
g++ \
libopenblas-dev \
python3-dev \
cmake \
git \
curl \
&& apt-get clean

RUN pip install --upgrade pip

WORKDIR /usr/src/app

# Copy and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Set the LLM_API_KEY environment variable from the build argument
ARG LLM_API_KEY
ENV LLM_API_KEY=${LLM_API_KEY}

# When targeting a local Ollama backend, run the bundled verification script at build time
RUN if [ "$LLM_API_KEY" = "ollama" ]; then python3 verify_ollama.py; fi

# Run the dispute service when the container starts
CMD ["python", "main.py"]
31 changes: 31 additions & 0 deletions dispute/README.md
@@ -0,0 +1,31 @@
## Ollama

```
LLM_BASE_URL=http://host.docker.internal:11434/v1
LLM_API_KEY=ollama
MODELS=llama3.1:8b,qwen:4b,mistral:7b
SIMULATION_MODEL=llama3.1:8b
ORCHESTRATOR_URL=https://orchestrator.chasm.net
WEBHOOK_API_KEY=WlRFN20pc1IpMmlwZ2hBXk9ec1JWNkpzSzJjI2EqNHc
MIN_CONFIDENCE_SCORE=0.5
```

## OpenRouter

https://openrouter.ai/docs#models

```
LLM_API_KEY=
LLM_BASE_URL=https://openrouter.ai/api/v1
MODELS=google/gemma-7b-it,meta-llama/llama-3-8b-instruct,microsoft/wizardlm-2-7b,mistralai/mistral-7b-instruct-v0.3
SIMULATION_MODEL=meta-llama/llama-3-8b-instruct
```

## Groq

```
LLM_API_KEY=
LLM_BASE_URL=https://api.groq.com/openai/v1
MODELS=llama3-8b-8192,mixtral-8x7b-32768,gemma-7b-it
SIMULATION_MODEL=llama3-8b-8192
```
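All three providers expose an OpenAI-compatible API, so a quick way to sanity-check a configuration is to point the `openai` client (already pinned in `requirements.txt`) at `LLM_BASE_URL` and confirm the models listed in `MODELS` are actually served. A minimal sketch, assuming the variables above are exported in your environment (not part of this commit):

```
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.environ["LLM_BASE_URL"],  # e.g. http://host.docker.internal:11434/v1
    api_key=os.environ["LLM_API_KEY"],    # "ollama" for a local Ollama server
)

# Compare the models the endpoint serves against the MODELS setting.
available = {model.id for model in client.models.list()}
configured = set(os.environ["MODELS"].split(","))
print("missing models:", configured - available or "none")
```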
Empty file added dispute/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions dispute/benchmark.py
@@ -0,0 +1,27 @@
import dotenv
dotenv.load_dotenv()

import asyncio
from util.chasm import ChasmConnection
from strategy import analyze_text

async def main():
connection = ChasmConnection()
histories = connection.get_benchmark_test()
    for i, history in enumerate(histories):
        print(f"--- {i} ---")
        result = await analyze_text(history["input"], history["output"])
        print(f"Result: {result}")
        print(f"Score: {result['confidence_score']}")
        print(f"Assert Check: {'✅' if result['correct'] == history['answer'] else '❌'}")
        print(f"Dispute: {result['dispute']}")


if __name__ == "__main__":
# set TOKENIZERS_PARALLELISM
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
asyncio.run(main())
20 changes: 20 additions & 0 deletions dispute/config.py
@@ -0,0 +1,20 @@
import os
from dotenv import load_dotenv

load_dotenv()

LLM_BASE_URL = os.getenv("LLM_BASE_URL")
LLM_API_KEY = os.getenv("LLM_API_KEY")
MODELS = os.getenv("MODELS", "gemma2-9b-it").split(",")
SIMULATION_MODEL = os.getenv("SIMULATION_MODEL", "gemma2-9b-it")
ORCHESTRATOR_URL = os.getenv("ORCHESTRATOR_URL")
WEBHOOK_API_KEY = os.getenv("WEBHOOK_API_KEY")
MIN_CONFIDENCE_SCORE = float(os.getenv("MIN_CONFIDENCE_SCORE", 0.5))
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

print(f"LOG_LEVEL: {LOG_LEVEL}")
print(f"MIN_CONFIDENCE_SCORE: {MIN_CONFIDENCE_SCORE}")
print(f"LLM_BASE_URL: {LLM_BASE_URL}")
print(f"MODELS: {MODELS}")
print(f"SIMULATION_MODEL: {SIMULATION_MODEL}")
print(f"ORCHESTRATOR_URL: {ORCHESTRATOR_URL}")
19 changes: 19 additions & 0 deletions dispute/docker-compose.yml
@@ -0,0 +1,19 @@
version: '3.8'

services:
app:
network_mode: "host"
build:
context: .
args:
LLM_API_KEY: ${LLM_API_KEY}
env_file:
- .env
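    # Note: published ports are discarded when network_mode is "host"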
ports:
- "11434:11434"
environment:
- LLM_API_KEY=${LLM_API_KEY}
- LLM_BASE_URL=${LLM_BASE_URL}
- MODELS=${MODELS}
- SIMULATION_MODEL=${SIMULATION_MODEL}
command: python3 main.py
66 changes: 66 additions & 0 deletions dispute/main.py
@@ -0,0 +1,66 @@
from config import LOG_LEVEL, MIN_CONFIDENCE_SCORE
import asyncio
import os
from strategy import analyze_text
from util.chasm import ChasmConnection
import json
import logging

logging.basicConfig(
level=LOG_LEVEL,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)

chasm = ChasmConnection()
PROCESSED_HISTORIES_FILE = "processed_histories.json"

def load_processed_histories():
if os.path.exists(PROCESSED_HISTORIES_FILE):
with open(PROCESSED_HISTORIES_FILE, "r") as f:
return set(json.load(f))
return set()

def save_processed_histories(histories):
with open(PROCESSED_HISTORIES_FILE, "w") as f:
json.dump(list(histories), f)
processed_histories = load_processed_histories()

async def process_histories():
histories = chasm.get_prompt_history()
logging.info(f"Histories: {len(histories)}")
for history in histories:
if history["_id"] in processed_histories:
logging.debug(f"Skipping already processed history: {history['_id']}")
continue
input = map(lambda x: x["content"], history["messages"])
input = "\n".join(input)
output = history["result"]["choices"][0]["message"]["content"]
result = await analyze_text(input, output)
logging.debug(f"Result: {result}")
logging.debug(f"Score: {result['confidence_score']}")

if result["confidence_score"] > MIN_CONFIDENCE_SCORE:
response = chasm.file_dispute(
history["_id"],
history["messages"],
history["result"]["choices"][0]["message"],
)
            logging.info("Dispute filed: %s", response)

# Cache history
processed_histories.add(history["_id"])
save_processed_histories(processed_histories)



async def main():
while True:
await process_histories()
await asyncio.sleep(1)


if __name__ == "__main__":
# set TOKENIZERS_PARALLELISM
os.environ["TOKENIZERS_PARALLELISM"] = "true"
asyncio.run(main())
6 changes: 6 additions & 0 deletions dispute/requirements.txt
@@ -0,0 +1,6 @@
openai==1.30.1
tiktoken==0.7.0
blobfile==2.1.1
scikit-learn==1.5.0
asyncio==3.4.3
python-dotenv==1.0.1
76 changes: 76 additions & 0 deletions dispute/strategies/LLMQuality.py
@@ -0,0 +1,76 @@
import re
import logging
from util.poll import PollAlgo
from strategies.tokenizer import Tokenizer


class LLMQualityStrategy:
def __init__(self, models):

# llama3 tokenizer
self.tokenizer = Tokenizer()
self.poll = PollAlgo(models)

def format_text_limit(self, text: str, limit: int):
encoded_result = self.tokenizer.encode(text, bos=False, eos=False)
return self.tokenizer.decode(encoded_result[:limit])

async def analyze(self, input: str, output: str):
input = self.format_text_limit(input, 3000)
output = self.format_text_limit(output, 3000)

# Prompt from https://arxiv.org/abs/2306.05685v4
results = await self.poll.create_poll_completion(
messages=[
{
"role": "system",
"content": """You are judging whether a model has generated a correct answer to a question.
Study the examples the user gives you as they will be very informative for how to do the task.
Don’t worry about factuality with respect to the real world, just judge the example based on what you see.
No need to overthink this task, it really comes down to just soft matching.
If the Provided Answer is correct say exactly "True", otherwise say "False".
Only answer the question, do not provide supporting exposition.
""",
},
{
"role": "user",
"content": """Question: What element is the main constituent of anthracite?
Provided Answer: The main constituent of anthracite is carbon. High-grade anthracite, for example, has a chemical formula represented by CHONS, indicating it contains carbon, hydrogen, oxygen, nitrogen, and sulfur, with carbon making up 94% of its composition. Ultra high-grade (UHG) anthracite typically has a minimum carbon content of 95%, further emphasizing carbon as the primary element in anthracite.
Answer: Carbon
""",
},
{"role": "assistant", "content": "True"},
{
"role": "user",
"content": """Question: The religious order of Poor Ladies of San Damiano has what more common name?
Provided Answer: The religious order of Poor Ladies of San Damiano is more commonly known as the Poor Clares. This order was founded by Saints Clare of Assisi and Francis of Assisi in 1212, and it was the second Franciscan Order to be established. The Poor Clares were initially referred to as the Order of Poor Ladies, and later the Clarisses, the Minoresses, the Franciscan Clarist Order, and the Second Order of Saint Francis. However, the name Poor Clares is the most recognized and widely used today.
Answer: Franciscan Clarist Order
""",
},
{"role": "assistant", "content": "False"},
{"role": "user", "content": f"Question: {input}\nAnswer: {output}"},
],
)
logging.debug(results)
llm_results = []

for r in results:
assert r is not None, "LLM returned None"
r_lower = r.lower()
if "true" in r_lower and "false" in r_lower:
true_index = re.search(r"\btrue\b", r_lower).start()
false_index = re.search(r"\bfalse\b", r_lower).start()
llm_results.append(0 if true_index < false_index else 1)
elif "true" in r_lower:
llm_results.append(0)
elif "false" in r_lower:
llm_results.append(1)
elif "yes" in r_lower:
llm_results.append(0)
elif "no" in r_lower:
llm_results.append(1)
else:
llm_results.append(0.5)
logging.debug(f"LLM Results: {llm_results}")
score = sum(llm_results) / len(llm_results)
return score
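For clarity, the per-model replies are mapped to numeric votes ("True"/"yes" → 0, "False"/"no" → 1, anything ambiguous → 0.5) and the returned score is their mean, so higher values mean more judge models rejected the answer. A small illustration with hypothetical votes (not from this commit):

```
# Hypothetical replies from three judge models
replies = ["False", "True", "false, the answer is wrong"]
votes = [1, 0, 1]                # mapped as in LLMQualityStrategy.analyze
score = sum(votes) / len(votes)  # 0.666..., i.e. two of three judges rejected the answer
print(score)
```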
35 changes: 35 additions & 0 deletions dispute/strategies/ResponseSimilarity.py
@@ -0,0 +1,35 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from util.llm import llm

class ResponseSimilarityAnalysis:
def __init__(self, model: str):
self.model = model

async def analyze(self, input: str, output: str):
loop = asyncio.get_event_loop()
score = await loop.run_in_executor(None, self._sync_analyse, input, output)
return score

def _sync_analyse(self, input: str, output: str):

simulated_output = llm.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": input},
],
model=self.model,
temperature=0.8,
)
simulated_output_response = simulated_output.choices[0].message.content
assert simulated_output_response is not None, "Simulated output is None"
score = self.similarity(simulated_output_response, output)
return score

    def similarity(self, text1: str, text2: str) -> float:
        # Returns 1 - cosine similarity of the TF-IDF vectors, i.e. a dissimilarity:
        # 0.0 for responses with identical term profiles, values near 1.0 for very different ones.
vectorizer = TfidfVectorizer().fit_transform([text1, text2])
vectors = vectorizer.toarray()
cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
score = cosine_sim[0][0]
return 1 - score
19 changes: 19 additions & 0 deletions dispute/strategies/SemanticSimilarity.py
@@ -0,0 +1,19 @@
import asyncio
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class SemanticSimilarityAnalysis:
def __init__(self):
pass

async def analyze(self, input: str, output: str) -> float:
loop = asyncio.get_event_loop()
score = await loop.run_in_executor(None, self._sync_analyze, input, output)
return score

def _sync_analyze(self, input: str, output: str) -> float:
vectorizer = TfidfVectorizer().fit_transform([input, output])
vectors = vectorizer.toarray()
cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
score = cosine_sim[0][0]
return 1 - score
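Both similarity strategies share the same convention: they return `1 - cosine_similarity` of the TF-IDF vectors, so the result is a dissimilarity in [0, 1] where 0 means the two texts use essentially the same terms. A hypothetical quick check, run from inside the `dispute/` directory:

```
import asyncio

from strategies.SemanticSimilarity import SemanticSimilarityAnalysis

analysis = SemanticSimilarityAnalysis()
score = asyncio.run(
    analysis.analyze("Paris is the capital of France",
                     "the capital of France is Paris")
)
print(score)  # close to 0.0, since the two texts share the same term profile
```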