Commit

initial

matatonic committed Mar 31, 2024
1 parent 7fba7e2 commit 93bc2bc

Showing 12 changed files with 439 additions and 0 deletions.
68 changes: 68 additions & 0 deletions .github/workflows/build-docker.yml
@@ -0,0 +1,68 @@
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
#
hf_home/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
9 changes: 9 additions & 0 deletions Dockerfile
@@ -0,0 +1,9 @@
FROM python:3-slim

RUN mkdir -p /app
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY *.py .
COPY backend /app/backend
CMD python vision.py
70 changes: 70 additions & 0 deletions README.md
@@ -0,0 +1,70 @@
OpenedAI Vision
---------------

An OpenAI API compatible vision server. It functions like `gpt-4-vision-preview` and lets you chat about the contents of an image.

- Compatible with the OpenAI Vision API (aka "chat with images")
- Does not connect to the OpenAI API and does not require an OpenAI API Key
- Not affiliated with OpenAI in any way

Backend Model support:
- [X] Moondream2 [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only a single image and a single question are currently supported)*
- [ ] Deepseek-VL - (in progress) [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
- [ ] ...

Version: 0.1.0


API Documentation
-----------------

* [OpenAI Vision guide](https://platform.openai.com/docs/guides/vision)

Installation instructions
-------------------------

```shell
# install the python dependencies
pip install -r requirements.txt
# run the server
python vision.py
```

Usage
-----

```
usage: vision.py [-h] [-m MODEL] [-b BACKEND] [-d DEVICE] [-P PORT] [-H HOST] [--preload]

OpenedAI Vision API Server

options:
  -h, --help            show this help message and exit
  -m MODEL, --model MODEL
                        The model to use, Ex. deepseek-ai/deepseek-vl-7b-chat (default: vikhyatk/moondream2)
  -b BACKEND, --backend BACKEND
                        The backend to use (moondream, deepseek) (default: moondream)
  -d DEVICE, --device DEVICE
                        Set the torch device for the model. Ex. cuda:1 (default: auto)
  -P PORT, --port PORT  Server tcp port (default: 5006)
  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: localhost)
  --preload             Preload model and exit. (default: False)
```

Docker support
--------------

You can run the server via docker like so:
```shell
docker compose up
```
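
To try a different model or backend without editing the compose file, you can override the container command; a sketch using the flags documented under Usage (the deepseek backend is still in progress per the list above):
```shell
docker compose run --service-ports server \
  python vision.py --host 0.0.0.0 --port 5006 \
  -m deepseek-ai/deepseek-vl-7b-chat -b deepseek
```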

Sample API Usage
----------------

`test_vision.py` has a sample of how to use the API.
Example:
```
$ test_vision.py https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
The image features a long wooden boardwalk running through a lush green field. The boardwalk is situated in a grassy area with trees in the background, creating a serene and picturesque scene. The sky above is filled with clouds, adding to the beauty of the landscape. The boardwalk appears to be a peaceful path for people to walk or hike along, providing a connection between the grassy field and the surrounding environment.
```
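
Since the server speaks the OpenAI chat completions protocol, any OpenAI-compatible client should work. A minimal curl sketch (assuming the standard `/v1/chat/completions` route the OpenAI Python client targets, on the default port):
```shell
curl http://localhost:5006/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4-vision-preview",
    "messages": [{"role": "user", "content": [
      {"type": "text", "text": "Describe the image"},
      {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}}
    ]}],
    "max_tokens": 300
  }'
```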
31 changes: 31 additions & 0 deletions backend/moondream.py
@@ -0,0 +1,31 @@

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from vision_qna import VisionQnABase

class VisionQnA(VisionQnABase):
    model_name: str = "moondream2"
    revision: str = '2024-03-13'

    def __init__(self, model_id: str, device: str):
        if device == 'auto':
            device = self.select_device()

        params = {
            'pretrained_model_name_or_path': model_id,
            'trust_remote_code': True,
            'revision': self.revision,
            'torch_dtype': torch.float32 if device == 'cpu' else torch.float16,
        }

        self.model = AutoModelForCausalLM.from_pretrained(**params).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def select_device(self):
        # Prefer CUDA, then Apple MPS, falling back to CPU
        return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    async def single_question(self, image_url: str, prompt: str) -> str:
        image = await self.url_to_image(image_url)
        encoded_image = self.model.encode_image(image)
        return self.model.answer_question(encoded_image, prompt, self.tokenizer)
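
For orientation, a minimal sketch of driving this backend directly, outside the HTTP server (it assumes `VisionQnABase` supplies `url_to_image`, as used in `single_question` above; the image URL is the README's sample):
```python
import asyncio
from backend.moondream import VisionQnA

async def main():
    # 'auto' resolves to cuda, mps, or cpu via select_device()
    vqa = VisionQnA('vikhyatk/moondream2', device='auto')
    answer = await vqa.single_question(
        'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg',
        'Describe the image')
    print(answer)

asyncio.run(main())
```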
23 changes: 23 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,23 @@
services:
  server:
    build:
      context: .
      dockerfile: Dockerfile
    tty: true
    image: ghcr.io/matatonic/openedai-vision
    environment:
      - HF_HOME=/app/hf_home
    volumes:
      - ./hf_home:/app/hf_home
    ports:
      - 5006:5006
    command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006"]
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              #device_ids: ['0', '1'] # Select a gpu, or
              count: all
              capabilities: [gpu]
Empty file added hf_home/hf_home.txt
Empty file.
66 changes: 66 additions & 0 deletions openedai.py
@@ -0,0 +1,66 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse

class OpenAIStub(FastAPI):
    def __init__(self) -> None:
        super().__init__()
        self.models = {}

        self.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"]
        )

        @self.get('/v1/billing/usage')
        @self.get('/v1/dashboard/billing/usage')
        async def handle_billing_usage():
            return { 'total_usage': 0 }

        @self.get("/", response_class=PlainTextResponse)
        @self.head("/", response_class=PlainTextResponse)
        @self.options("/", response_class=PlainTextResponse)
        async def root():
            # 503 until at least one model has been registered
            return PlainTextResponse(content="", status_code=200 if self.models else 503)

        @self.get("/health")
        async def health():
            return {"status": "ok" if self.models else "unk"}

        @self.get("/v1/models")
        async def get_model_list():
            return self.model_list()

        @self.get("/v1/models/{model}")
        async def get_model_info(model: str):
            return self.model_info(model)

    def register_model(self, name: str, model: str = None) -> None:
        self.models[name] = model if model else name

    def deregister_model(self, name: str) -> None:
        if name in self.models:
            del self.models[name]

    def model_info(self, model: str) -> dict:
        result = {
            "id": model,
            "object": "model",
            "created": 0,
            "owned_by": "user"
        }
        return result

    def model_list(self) -> dict:
        if not self.models:
            return {}

        result = {
            "object": "list",
            # include both registered aliases and their underlying model names
            "data": [ self.model_info(model) for model in list(set(self.models.keys() | self.models.values())) if model ]
        }

        return result
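
A usage sketch (not part of this commit): a server built on this stub registers its model and serves with uvicorn; until `register_model` is called, `/` returns 503 and `/health` reports `unk`:
```python
import uvicorn
from openedai import OpenAIStub

app = OpenAIStub()
# map the OpenAI-style alias to the backend model so /v1/models lists both
app.register_model('gpt-4-vision-preview', 'vikhyatk/moondream2')

uvicorn.run(app, host='localhost', port=5006)
```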
9 changes: 9 additions & 0 deletions requirements.txt
@@ -0,0 +1,9 @@
python-datauri
requests
uvicorn
fastapi

# moondream
timm
einops
transformers>=4.39.0
39 changes: 39 additions & 0 deletions test_vision.py
@@ -0,0 +1,39 @@
#!/usr/bin/env python
import argparse
from datauri import DataURI
from openai import OpenAI

# Initialize argparse
parser = argparse.ArgumentParser(description='Test vision using OpenAI')
parser.add_argument('image_url', type=str, help='URL or image file to be tested')
parser.add_argument('question', type=str, nargs='?', default='Describe the image', help='The question to ask the image')
args = parser.parse_args()

client = OpenAI(base_url='http://localhost:5006/v1', api_key='skip')

image_url = args.image_url
question = args.question

if not image_url.startswith('http'):
    image_url = str(DataURI.from_file(image_url))

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                    },
                },
            ],
        }
    ],
    max_tokens=300,
)

print(response.choices[0].message.content)