0.0.5 Release: Dispute System #5

Merged 2 commits on Aug 6, 2024
2 changes: 2 additions & 0 deletions .eslintignore
@@ -1,3 +1,5 @@
+dispute/
+
 # IDE
 .idea/
 .vscode/
1 change: 0 additions & 1 deletion .github/workflows/pre-merge-checks.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches: ["main"]
   pull_request:
-    branches: [ main, master ]

 jobs:
   checkout-install:
3 changes: 3 additions & 0 deletions .gitignore
@@ -19,3 +19,6 @@ Thumbs.db

 # Logs
 *.log
+
+logs/
+__pycache__/
2 changes: 2 additions & 0 deletions .prettierignore
@@ -1,3 +1,5 @@
+dispute/
+
 # IDE
 .idea/
 .vscode/
7 changes: 7 additions & 0 deletions dispute/.env.sample
@@ -0,0 +1,7 @@
LLM_BASE_URL=http://host.docker.internal:11434/v1
LLM_API_KEY=ollama
MODELS=llama3.1:8b,qwen:4b,mistral:7b
SIMULATION_MODEL=llama3.1:8b
ORCHESTRATOR_URL=https://orchestrator.chasm.net
WEBHOOK_API_KEY=
MIN_CONFIDENCE_SCORE=0.5
3 changes: 3 additions & 0 deletions dispute/.gitignore
@@ -0,0 +1,3 @@
venv
__pycache__
processed_histories.json
38 changes: 38 additions & 0 deletions dispute/Dockerfile
@@ -0,0 +1,38 @@
# Use the official Python image from the Docker Hub
FROM python:3.10-slim-bullseye

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    g++ \
    libopenblas-dev \
    python3-dev \
    cmake \
    git \
    curl \
    && apt-get clean

WORKDIR /usr/src/app

# Copy and install dependencies
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Make the API key available at build and run time
ARG LLM_API_KEY
ENV LLM_API_KEY=${LLM_API_KEY}

# Debug output: show the key the build is using
RUN echo "----------"
RUN echo $LLM_API_KEY

# When targeting a local Ollama backend, verify it is reachable at build time
RUN if [ "$LLM_API_KEY" = "ollama" ]; then python3 verify_ollama.py; fi

# Specify the command to run on container start
CMD ["python", "main.py"]
31 changes: 31 additions & 0 deletions dispute/README.md
@@ -0,0 +1,31 @@
## Ollama

```
LLM_BASE_URL=http://host.docker.internal:11434/v1
LLM_API_KEY=ollama
MODELS=llama3.1:8b,qwen:4b,mistral:7b
SIMULATION_MODEL=llama3.1:8b
ORCHESTRATOR_URL=https://orchestrator.chasm.net
WEBHOOK_API_KEY=WlRFN20pc1IpMmlwZ2hBXk9ec1JWNkpzSzJjI2EqNHc
MIN_CONFIDENCE_SCORE=0.5
```

## OpenRouter

https://openrouter.ai/docs#models

```
LLM_API_KEY=
LLM_BASE_URL=https://openrouter.ai/api/v1
MODELS=google/gemma-7b-it,meta-llama/llama-3-8b-instruct,microsoft/wizardlm-2-7b,mistralai/mistral-7b-instruct-v0.3
SIMULATION_MODEL=meta-llama/llama-3-8b-instruct
```

## Groq

```
LLM_API_KEY=
LLM_BASE_URL=https://api.groq.com/openai/v1
MODELS=llama3-8b-8192,mixtral-8x7b-32768,gemma-7b-it
SIMULATION_MODEL=llama3-8b-8192
```
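All three configurations point at an OpenAI-compatible endpoint, which is why only `LLM_BASE_URL`, `LLM_API_KEY`, and the model names differ between providers. A minimal sketch of how such settings can be wired into a client (the project's actual client lives in `util/llm`, which is outside this diff, so the code below is illustrative only):

```python
import os
from openai import OpenAI  # openai>=1.x, as pinned in dispute/requirements.txt

# Illustrative only: mirrors how config.py reads the same variables.
client = OpenAI(
    base_url=os.getenv("LLM_BASE_URL"),
    api_key=os.getenv("LLM_API_KEY"),
)

models = os.getenv("MODELS", "").split(",")

# Any entry from MODELS can be queried through the same client.
response = client.chat.completions.create(
    model=models[0],
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)
```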
Empty file added dispute/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions dispute/benchmark.py
@@ -0,0 +1,27 @@
import dotenv

dotenv.load_dotenv()

import asyncio
from util.chasm import ChasmConnection
from strategy import analyze_text


async def main():
    connection = ChasmConnection()
    histories = connection.get_benchmark_test()
    for i, history in enumerate(histories):
        print(f"--- {i} ---")
        result = await analyze_text(history["input"], history["output"])
        print(f"Result: {result}")
        print(f"Score: {result['confidence_score']}")
        print(f"Assert Check: {'✅' if result['correct'] == history['answer'] else '❌'}")
        print(f"Dispute: {result['dispute']}")


if __name__ == "__main__":
    # set TOKENIZERS_PARALLELISM
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    asyncio.run(main())
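benchmark.py above and main.py later in this PR both treat the value returned by `strategy.analyze_text` as a mapping with at least `confidence_score`, `correct`, and `dispute` keys. `strategy.py` itself is not part of this diff, so the shape below is inferred purely from how the result is used, not from its actual definition:

```python
from typing import TypedDict

class AnalysisResult(TypedDict):
    # Inferred from how benchmark.py and main.py consume the result;
    # the real strategy.analyze_text may return more, or differently typed, fields.
    confidence_score: float  # compared against MIN_CONFIDENCE_SCORE in main.py
    correct: object          # compared with history["answer"] in benchmark.py; type not shown here
    dispute: object          # printed by benchmark.py; type not shown here
```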
20 changes: 20 additions & 0 deletions dispute/config.py
@@ -0,0 +1,20 @@
import os
from dotenv import load_dotenv

load_dotenv()

LLM_BASE_URL = os.getenv("LLM_BASE_URL")
LLM_API_KEY = os.getenv("LLM_API_KEY")
MODELS = os.getenv("MODELS", "gemma2-9b-it").split(",")
SIMULATION_MODEL = os.getenv("SIMULATION_MODEL", "gemma2-9b-it")
ORCHESTRATOR_URL = os.getenv("ORCHESTRATOR_URL")
WEBHOOK_API_KEY = os.getenv("WEBHOOK_API_KEY")
MIN_CONFIDENCE_SCORE = float(os.getenv("MIN_CONFIDENCE_SCORE", 0.5))
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

print(f"LOG_LEVEL: {LOG_LEVEL}")
print(f"MIN_CONFIDENCE_SCORE: {MIN_CONFIDENCE_SCORE}")
print(f"LLM_BASE_URL: {LLM_BASE_URL}")
print(f"MODELS: {MODELS}")
print(f"SIMULATION_MODEL: {SIMULATION_MODEL}")
print(f"ORCHESTRATOR_URL: {ORCHESTRATOR_URL}")
19 changes: 19 additions & 0 deletions dispute/docker-compose.yml
@@ -0,0 +1,19 @@
version: '3.8'

services:
  app:
    network_mode: "host"
    build:
      context: .
      args:
        LLM_API_KEY: ${LLM_API_KEY}
    env_file:
      - .env
    ports:
      - "11434:11434"
    environment:
      - LLM_API_KEY=${LLM_API_KEY}
      - LLM_BASE_URL=${LLM_BASE_URL}
      - MODELS=${MODELS}
      - SIMULATION_MODEL=${SIMULATION_MODEL}
    command: python3 main.py
66 changes: 66 additions & 0 deletions dispute/main.py
@@ -0,0 +1,66 @@
from config import LOG_LEVEL, MIN_CONFIDENCE_SCORE
import asyncio
import os
from strategy import analyze_text
from util.chasm import ChasmConnection
import json
import logging

logging.basicConfig(
    level=LOG_LEVEL,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

chasm = ChasmConnection()
PROCESSED_HISTORIES_FILE = "processed_histories.json"


def load_processed_histories():
    if os.path.exists(PROCESSED_HISTORIES_FILE):
        with open(PROCESSED_HISTORIES_FILE, "r") as f:
            return set(json.load(f))
    return set()


def save_processed_histories(histories):
    with open(PROCESSED_HISTORIES_FILE, "w") as f:
        json.dump(list(histories), f)


processed_histories = load_processed_histories()


async def process_histories():
    histories = chasm.get_prompt_history()
    logging.info(f"Histories: {len(histories)}")
    for history in histories:
        if history["_id"] in processed_histories:
            logging.debug(f"Skipping already processed history: {history['_id']}")
            continue
        input = "\n".join(m["content"] for m in history["messages"])
        output = history["result"]["choices"][0]["message"]["content"]
        result = await analyze_text(input, output)
        logging.debug(f"Result: {result}")
        logging.debug(f"Score: {result['confidence_score']}")

        if result["confidence_score"] > MIN_CONFIDENCE_SCORE:
            response = chasm.file_dispute(
                history["_id"],
                history["messages"],
                history["result"]["choices"][0]["message"],
            )
            logging.info(f"Dispute filed: {response}")

        # Cache history so it is not re-analyzed on the next poll
        processed_histories.add(history["_id"])
        save_processed_histories(processed_histories)


async def main():
    while True:
        await process_histories()
        await asyncio.sleep(1)


if __name__ == "__main__":
    # set TOKENIZERS_PARALLELISM
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    asyncio.run(main())
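The prompt-history records returned by `ChasmConnection.get_prompt_history` are not defined in this diff; the loop above only relies on the fields sketched below. A hypothetical record, inferred from those accesses:

```python
# Hypothetical example record, inferred from the fields main.py reads.
history = {
    "_id": "example-id",                # used for de-duplication across polls
    "messages": [                       # joined into the analysis input
        {"role": "user", "content": "What element is the main constituent of anthracite?"},
    ],
    "result": {                         # the provider response under review
        "choices": [
            {"message": {"role": "assistant", "content": "Mostly carbon."}}
        ]
    },
}
```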
6 changes: 6 additions & 0 deletions dispute/requirements.txt
@@ -0,0 +1,6 @@
openai==1.30.1
tiktoken==0.7.0
blobfile==2.1.1
scikit-learn==1.5.0
asyncio==3.4.3
python-dotenv==1.0.1
76 changes: 76 additions & 0 deletions dispute/strategies/LLMQuality.py
@@ -0,0 +1,76 @@
import logging
from util.poll import PollAlgo
from strategies.tokenizer import Tokenizer


class LLMQualityStrategy:
    def __init__(self, models):
        # llama3 tokenizer
        self.tokenizer = Tokenizer()
        self.poll = PollAlgo(models)

    def format_text_limit(self, text: str, limit: int):
        encoded_result = self.tokenizer.encode(text, bos=False, eos=False)
        return self.tokenizer.decode(encoded_result[:limit])

    async def analyze(self, input: str, output: str):
        input = self.format_text_limit(input, 3000)
        output = self.format_text_limit(output, 3000)

        # Prompt from https://arxiv.org/abs/2306.05685v4
        results = await self.poll.create_poll_completion(
            messages=[
                {
                    "role": "system",
                    "content": """You are judging whether a model has generated a correct answer to a question.
Study the examples the user gives you as they will be very informative for how to do the task.
Don’t worry about factuality with respect to the real world, just judge the example based on what you see.
No need to overthink this task, it really comes down to just soft matching.
If the Provided Answer is correct say exactly "True", otherwise say "False".
Only answer the question, do not provide supporting exposition.
""",
                },
                {
                    "role": "user",
                    "content": """Question: What element is the main constituent of anthracite?
Provided Answer: The main constituent of anthracite is carbon. High-grade anthracite, for example, has a chemical formula represented by CHONS, indicating it contains carbon, hydrogen, oxygen, nitrogen, and sulfur, with carbon making up 94% of its composition. Ultra high-grade (UHG) anthracite typically has a minimum carbon content of 95%, further emphasizing carbon as the primary element in anthracite.
Answer: Carbon
""",
                },
                {"role": "assistant", "content": "True"},
                {
                    "role": "user",
                    "content": """Question: The religious order of Poor Ladies of San Damiano has what more common name?
Provided Answer: The religious order of Poor Ladies of San Damiano is more commonly known as the Poor Clares. This order was founded by Saints Clare of Assisi and Francis of Assisi in 1212, and it was the second Franciscan Order to be established. The Poor Clares were initially referred to as the Order of Poor Ladies, and later the Clarisses, the Minoresses, the Franciscan Clarist Order, and the Second Order of Saint Francis. However, the name Poor Clares is the most recognized and widely used today.
Answer: Franciscan Clarist Order
""",
                },
                {"role": "assistant", "content": "False"},
                {"role": "user", "content": f"Question: {input}\nAnswer: {output}"},
            ],
        )
        logging.debug(results)
        llm_results = []

        for r in results:
            assert r is not None, "LLM returned None"
            r_lower = r.lower()
            if "true" in r_lower and "false" in r_lower:
                # Both tokens present: whichever appears first wins
                true_index = r_lower.find("true")
                false_index = r_lower.find("false")
                llm_results.append(0 if true_index < false_index else 1)
            elif "true" in r_lower:
                llm_results.append(0)
            elif "false" in r_lower:
                llm_results.append(1)
            elif "yes" in r_lower:
                llm_results.append(0)
            elif "no" in r_lower:
                llm_results.append(1)
            else:
                llm_results.append(0.5)
        logging.debug(f"LLM Results: {llm_results}")
        score = sum(llm_results) / len(llm_results)
        return score
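Each polled model answers independently; a "True"/"yes" vote is mapped to 0 (output judged correct), a "False"/"no" vote to 1, and anything unparseable to 0.5, so the returned score is essentially the fraction of judges that consider the output wrong. A self-contained illustration of that averaging (the parsing here is deliberately simplified relative to `analyze` above):

```python
def vote_to_score(reply: str) -> float:
    # Simplified version of the mapping in LLMQualityStrategy.analyze:
    # "True"/"yes" -> 0 (judged correct), "False"/"no" -> 1, unclear -> 0.5
    reply = reply.lower()
    if "true" in reply or "yes" in reply:
        return 0.0
    if "false" in reply or "no" in reply:
        return 1.0
    return 0.5

replies = ["True", "false", "True."]
score = sum(vote_to_score(r) for r in replies) / len(replies)
print(score)  # 0.333... -> one of three judges thinks the output is wrong
```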
35 changes: 35 additions & 0 deletions dispute/strategies/ResponseSimilarity.py
@@ -0,0 +1,35 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from util.llm import llm


class ResponseSimilarityAnalysis:
    def __init__(self, model: str):
        self.model = model

    async def analyze(self, input: str, output: str):
        loop = asyncio.get_event_loop()
        score = await loop.run_in_executor(None, self._sync_analyse, input, output)
        return score

    def _sync_analyse(self, input: str, output: str):
        simulated_output = llm.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": input},
            ],
            model=self.model,
            temperature=0.8,
        )
        simulated_output_response = simulated_output.choices[0].message.content
        assert simulated_output_response is not None, "Simulated output is None"
        score = self.similarity(simulated_output_response, output)
        return score

    def similarity(self, text1: str, text2: str) -> float:
        vectorizer = TfidfVectorizer().fit_transform([text1, text2])
        vectors = vectorizer.toarray()
        cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
        score = cosine_sim[0][0]
        return 1 - score
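The `similarity` helper returns a TF-IDF cosine dissimilarity: roughly 0 when the simulated answer and the reviewed output share most of their wording, and values approaching 1 when they share little. A standalone check of that behaviour using the same scikit-learn calls:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def dissimilarity(text1: str, text2: str) -> float:
    # Same computation as ResponseSimilarityAnalysis.similarity above.
    vectors = TfidfVectorizer().fit_transform([text1, text2]).toarray()
    return 1 - cosine_similarity(vectors[0:1], vectors[1:])[0][0]

print(dissimilarity("the capital of France is Paris",
                    "the capital of France is Paris"))   # ~0.0, identical wording
print(dissimilarity("the capital of France is Paris",
                    "anthracite is mostly carbon"))      # close to 1.0, little overlap
```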
19 changes: 19 additions & 0 deletions dispute/strategies/SemanticSimilarity.py
@@ -0,0 +1,19 @@
import asyncio
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class SemanticSimilarityAnalysis:
    def __init__(self):
        pass

    async def analyze(self, input: str, output: str) -> float:
        loop = asyncio.get_event_loop()
        score = await loop.run_in_executor(None, self._sync_analyze, input, output)
        return score

    def _sync_analyze(self, input: str, output: str) -> float:
        vectorizer = TfidfVectorizer().fit_transform([input, output])
        vectors = vectorizer.toarray()
        cosine_sim = cosine_similarity(vectors[0:1], vectors[1:])
        score = cosine_sim[0][0]
        return 1 - score