Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(agents-api): Add litellm proxy to docker compose #448

Merged
merged 1 commit on Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions agents-api/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ services:
container_name: text-embeddings-inference
environment:
- DTYPE=float16
- MODEL_ID=BAAI/bge-m3
- MODEL_ID=Alibaba-NLP/gte-large-en-v1.5

image: ghcr.io/huggingface/text-embeddings-inference:1.3
image: ghcr.io/huggingface/text-embeddings-inference:1.5
ports:
- "8082:80"
volumes:
Expand Down
1,917 changes: 573 additions & 1,344 deletions agents-api/poetry.lock

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions agents-api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,19 @@ pycozo = {extras = ["embedded"], version = "^0.7.6"}
uvicorn = "^0.23.2"
fire = "^0.5.0"
environs = "^10.3.0"
google-cloud-aiplatform = "^1.33.0"
pandas = "^2.1.0"
openai = "^1.12.0"
httpx = "^0.26.0"
async-lru = "^2.0.4"
sentry-sdk = {extras = ["fastapi"], version = "^1.38.0"}
temporalio = "^1.4.0"
pydantic = "^2.5.3"
arrow = "^1.3.0"
jinja2 = "^3.1.3"
jinja2schema = "^0.1.4"
jsonschema = "^4.21.1"
litellm = "^1.35.32"
litellm = "^1.43.3"
numpy = "^1.26.4"
transformers = "^4.40.1"
tiktoken = "^0.6.0"
xxhash = "^3.4.1"
tiktoken = "^0.7.0"
tenacity = "^8.3.0"
beartype = "^0.18.5"
pydantic-partial = "^0.5.5"
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ include:
- ./model-serving/docker-compose.yml
- ./gateway/docker-compose.yml
- ./agents-api/docker-compose.yml
- ./llm-proxy/docker-compose.yml

# TODO: Enable after testing
# - ./monitoring/docker-compose.yml
1 change: 1 addition & 0 deletions llm-proxy/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
!.keys
1 change: 1 addition & 0 deletions llm-proxy/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.keys
54 changes: 54 additions & 0 deletions llm-proxy/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
---
# Compose stack for the LiteLLM proxy: the proxy service plus the
# Postgres database and Redis cache it is configured to use
# (see litellm-config.yaml: general_settings.database_url / cache_params).
services:
  litellm:
    image: ghcr.io/berriai/litellm:main-stable
    volumes:
      - ./litellm-config.yaml:/app/config.yaml
      # Relative bind mount of the local .keys directory (credential files).
      - .keys:/app/.keys
    ports:
      - "4000:4000"
    env_file:
      - ../.env
    command:
      [
        "--config",
        "/app/config.yaml",
        "--port",
        "4000",
        "--num_workers",
        "8",
        "--telemetry",
        "False"
      ]
    depends_on:
      - litellm-db
      - litellm-redis

  litellm-db:
    # NOTE(review): image is unpinned; consider pinning a major version
    # (e.g. postgres:16) so the data volume survives image upgrades.
    image: postgres
    restart: always
    volumes:
      - litellm-db-data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    env_file:
      - ../.env
    healthcheck:
      # Assumes POSTGRES_DB=litellm and POSTGRES_USER=llmproxy are set in
      # ../.env — confirm against that file.
      test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
      interval: 1s
      timeout: 5s
      retries: 10

  litellm-redis:
    image: redis/redis-stack-server
    restart: always
    volumes:
      - litellm-redis-data:/data
    ports:
      - "6379:6379"
    env_file:
      - ../.env

volumes:
  litellm-db-data:
  litellm-redis-data:
127 changes: 127 additions & 0 deletions llm-proxy/litellm-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
---
# LiteLLM proxy configuration: model catalog, proxy/runtime settings,
# router strategy, and server-level settings.
environment_variables:
  NO_DOCS: "true"

model_list:
  # -*= Paid models =*-
  # -------------------

  # Gemini models
  - model_name: gemini-1.5-pro
    litellm_params:
      model: vertex_ai_beta/gemini-1.5-pro
      tags: ["paid"]
      vertex_credentials: os.environ/GOOGLE_APPLICATION_CREDENTIALS

  # NOTE: "claude-3.5-sonnet" appears twice (here via Vertex AI and below
  # via the Anthropic API). LiteLLM treats entries sharing a model_name as
  # one group and routes across them per router_settings.routing_strategy.
  - model_name: claude-3.5-sonnet
    litellm_params:
      model: vertex_ai/claude-3-5-sonnet@20240620
      tags: ["paid"]
      vertex_credentials: os.environ/GOOGLE_APPLICATION_CREDENTIALS

  # OpenAI models
  - model_name: "gpt-4-turbo"
    litellm_params:
      model: "openai/gpt-4-turbo"
      tags: ["paid"]
      api_key: os.environ/OPENAI_API_KEY

  - model_name: "gpt-4o"
    litellm_params:
      model: "openai/gpt-4o"
      tags: ["paid"]
      api_key: os.environ/OPENAI_API_KEY

  # Anthropic models
  - model_name: "claude-3.5-sonnet"
    litellm_params:
      model: "claude-3-5-sonnet-20240620"
      tags: ["paid"]
      api_key: os.environ/ANTHROPIC_API_KEY

  # Groq models
  - model_name: "llama-3.1-70b"
    litellm_params:
      model: "groq/llama-3.1-70b-versatile"
      tags: ["paid"]
      api_key: os.environ/GROQ_API_KEY

  - model_name: "llama-3.1-8b"
    litellm_params:
      model: "groq/llama-3.1-8b-instant"
      tags: ["paid"]
      api_key: os.environ/GROQ_API_KEY

  # -*= Embedding models =*-
  # ------------------------

  - model_name: text-embedding-3-large
    litellm_params:
      model: "openai/text-embedding-3-large"
      api_key: os.environ/OPENAI_API_KEY
      tags: ["paid"]

  - model_name: voyage-multilingual-2
    litellm_params:
      model: "voyage/voyage-multilingual-2"
      api_key: os.environ/VOYAGE_API_KEY
      tags: ["paid"]

  - model_name: voyage-large-2
    litellm_params:
      model: "voyage/voyage-large-2"
      api_key: os.environ/VOYAGE_API_KEY
      tags: ["paid"]

  # Self-hosted embeddings reached through an OpenAI-compatible endpoint
  # at EMBEDDING_SERVICE_BASE (the openai/ prefix selects that client).
  - model_name: gte-large-en-v1.5
    litellm_params:
      model: openai/Alibaba-NLP/gte-large-en-v1.5
      api_base: os.environ/EMBEDDING_SERVICE_BASE
      tags: ["free"]

  - model_name: bge-m3
    litellm_params:
      model: openai/BAAI/bge-m3
      api_base: os.environ/EMBEDDING_SERVICE_BASE
      tags: ["free"]

  # -*= Free models =*-
  # -------------------

  - model_name: gpt-4o-mini
    litellm_params:
      model: openai/gpt-4o-mini
      api_key: os.environ/OPENAI_API_KEY
      tags: ["free"]

# https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
litellm_settings:
  num_retries: 3
  request_timeout: 180
  allowed_fails: 3
  cooldown_time: 30
  drop_params: true
  modify_params: true
  telemetry: false
  retry: true
  add_function_to_prompt: true

  set_verbose: false
  cache: true
  cache_params:  # set cache params for redis
    type: redis
    namespace: "litellm_caching"
    host: os.environ/LITELLM_REDIS_HOST
    port: os.environ/LITELLM_REDIS_PORT
    password: os.environ/LITELLM_REDIS_PASSWORD

router_settings:
  routing_strategy: simple-shuffle
  num_retries: 3

general_settings:
  master_key: os.environ/LITELLM_MASTER_KEY
  database_url: os.environ/LITELLM_DATABASE_URL
  enforce_user_param: true
Loading