diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml
new file mode 100644
index 0000000..31439ed
--- /dev/null
+++ b/.github/workflows/build-docker.yml
@@ -0,0 +1,68 @@
+name: Build and Publish Docker Image
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - 'main'
+  release:
+    types: [published]
+
+jobs:
+  build-and-push-image:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+
+    env:
+      # Set up environment variables for the job
+      DOCKER_REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}
+      TAG: ${{ github.sha }}
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          install: true
+
+      # Log in to the GitHub Container Registry
+      - name: Login to Docker Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.DOCKER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      # Build and push the Docker image to GHCR for the main branch
+      - name: Build and Push Docker Image
+        if: github.ref == 'refs/heads/main'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          labels: version=${{ github.run_id }}
+
+      # For tagged releases, build and push the Docker image with the corresponding tag
+      - name: Build and Push Docker Image (Tagged)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+          labels: version=${{ github.run_id }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 68bc17f..7504e96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Hugging Face model cache
+hf_home/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..bfa36a3
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3-slim
+
+RUN mkdir -p /app
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+COPY *.py .
+COPY backend /app/backend
+CMD python vision.py
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f975113
--- /dev/null
+++ b/README.md
@@ -0,0 +1,70 @@
+OpenedAI Vision
+---------------
+
+An OpenAI API compatible vision server. It works like `gpt-4-vision-preview` and lets you chat about the contents of an image.
+
+- Compatible with the OpenAI Vision API (aka "chat with images")
+- Does not connect to the OpenAI API and does not require an OpenAI API Key
+- Not affiliated with OpenAI in any way
+
+Backend Model support:
+- [X] Moondream2 [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only a single image and a single question are currently supported)*
+- [ ] Deepseek-VL (in progress) [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
+- [ ] ...
+
+Version: 0.1.0
+
+
+API Documentation
+-----------------
+
+* [OpenAI Vision guide](https://platform.openai.com/docs/guides/vision)
+
+Installation instructions
+-------------------------
+
+```shell
+# install the python dependencies
+pip install -r requirements.txt
+# run the server
+python vision.py
+```
+
+Usage
+-----
+
+```
+usage: vision.py [-h] [-m MODEL] [-b BACKEND] [-d DEVICE] [-P PORT] [-H HOST] [--preload]
+
+OpenedAI Vision API Server
+
+options:
+  -h, --help            show this help message and exit
+  -m MODEL, --model MODEL
+                        The model to use, Ex. deepseek-ai/deepseek-vl-7b-chat (default: vikhyatk/moondream2)
+  -b BACKEND, --backend BACKEND
+                        The backend to use (moondream, deepseek) (default: moondream)
+  -d DEVICE, --device DEVICE
+                        Set the torch device for the model. Ex. cuda:1 (default: auto)
+  -P PORT, --port PORT  Server tcp port (default: 5006)
+  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: localhost)
+  --preload             Preload model and exit. (default: False)
+```
+
+Docker support
+--------------
+
+You can run the server with Docker Compose like so:
+```shell
+docker compose up
+```
+
+Sample API Usage
+----------------
+
+`test_vision.py` has a sample of how to use the API.
+Example:
+```
+$ test_vision.py https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
+The image features a long wooden boardwalk running through a lush green field. The boardwalk is situated in a grassy area with trees in the background, creating a serene and picturesque scene. The sky above is filled with clouds, adding to the beauty of the landscape. The boardwalk appears to be a peaceful path for people to walk or hike along, providing a connection between the grassy field and the surrounding environment.
+```
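For reference, the request shape the server accepts can also be exercised without the `openai` client. The sketch below assumes a server already running locally on the default port 5006 and reuses the sample image URL from the README; the payload mirrors the `ImageChatRequest` model defined later in `vision.py`.

```python
# Minimal sketch: call /v1/chat/completions directly, without the openai client.
# Assumes the server is already running on localhost:5006 (the default).
import requests

payload = {
    "model": "gpt-4-vision-preview",  # accepted for compatibility; the loaded backend answers
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe the image"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                    },
                },
            ],
        }
    ],
    "max_tokens": 300,
}

resp = requests.post("http://localhost:5006/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```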
diff --git a/backend/moondream.py b/backend/moondream.py
new file mode 100644
index 0000000..4e5fa36
--- /dev/null
+++ b/backend/moondream.py
@@ -0,0 +1,31 @@
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+from vision_qna import VisionQnABase
+
+class VisionQnA(VisionQnABase):
+    model_name: str = "moondream2"
+    revision: str = '2024-03-13'
+
+    def __init__(self, model_id: str, device: str):
+        if device == 'auto':
+            device = self.select_device()
+
+        params = {
+            'pretrained_model_name_or_path': model_id,
+            'trust_remote_code': True,
+            'revision': self.revision,
+            'torch_dtype': torch.float32 if device == 'cpu' else torch.float16,
+        }
+
+        self.model = AutoModelForCausalLM.from_pretrained(**params).to(device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    def select_device(self):
+        return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
+
+    async def single_question(self, image_url: str, prompt: str) -> str:
+        image = await self.url_to_image(image_url)
+        encoded_image = self.model.encode_image(image)
+        return self.model.answer_question(encoded_image, prompt, self.tokenizer)
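The backend contract that `vision.py` relies on is small: a module under `backend/` exposing a `VisionQnA` class with a `model_name` attribute, an `__init__(model_id, device)` constructor, and an async `single_question(image_url, prompt)` method, with `VisionQnABase.url_to_image()` available for fetching images. A rough skeleton for a future backend (for instance the planned Deepseek-VL one) might look like the sketch below; the model loading and inference steps are placeholders, not the real Deepseek-VL API.

```python
# backend/example.py -- hypothetical skeleton for an additional backend.
# Only the interface comes from this repo; loading/inference details are placeholders.
import torch

from vision_qna import VisionQnABase

class VisionQnA(VisionQnABase):
    model_name: str = "example"

    def __init__(self, model_id: str, device: str):
        if device == 'auto':
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.model = None  # TODO: load the processor/tokenizer and model for model_id

    async def single_question(self, image_url: str, prompt: str) -> str:
        image = await self.url_to_image(image_url)  # helper inherited from VisionQnABase
        # TODO: run the model on (image, prompt) and return the generated answer
        raise NotImplementedError("model inference goes here")
```

Saved as `backend/example.py`, it would be selected with `--backend example`.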
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..db376d1
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,23 @@
+services:
+  server:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    tty: true
+    image: ghcr.io/matatonic/openedai-vision
+    environment:
+      - HF_HOME=/app/hf_home
+    volumes:
+      - ./hf_home:/app/hf_home
+    ports:
+      - 5006:5006
+    command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006"]
+    runtime: nvidia
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              #device_ids: ['0', '1'] # select specific GPUs, or use count: all
+              count: all
+              capabilities: [gpu]
diff --git a/hf_home/hf_home.txt b/hf_home/hf_home.txt
new file mode 100644
index 0000000..e69de29
diff --git a/openedai.py b/openedai.py
new file mode 100644
index 0000000..0de55b3
--- /dev/null
+++ b/openedai.py
@@ -0,0 +1,66 @@
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import PlainTextResponse
+
+class OpenAIStub(FastAPI):
+    def __init__(self) -> None:
+        super().__init__()
+        self.models = {}
+
+        self.add_middleware(
+            CORSMiddleware,
+            allow_origins=["*"],
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"]
+        )
+
+        @self.get('/v1/billing/usage')
+        @self.get('/v1/dashboard/billing/usage')
+        async def handle_billing_usage():
+            return { 'total_usage': 0 }
+
+        @self.get("/", response_class=PlainTextResponse)
+        @self.head("/", response_class=PlainTextResponse)
+        @self.options("/", response_class=PlainTextResponse)
+        async def root():
+            return PlainTextResponse(content="", status_code=200 if self.models else 503)
+
+        @self.get("/health")
+        async def health():
+            return {"status": "ok" if self.models else "unk" }
+
+        @self.get("/v1/models")
+        async def get_model_list():
+            return self.model_list()
+
+        @self.get("/v1/models/{model_id}")
+        async def get_model_info(model_id: str):
+            return self.model_info(model_id)
+
+    def register_model(self, name: str, model: str = None) -> None:
+        self.models[name] = model if model else name
+
+    def deregister_model(self, name: str) -> None:
+        if name in self.models:
+            del self.models[name]
+
+    def model_info(self, model: str) -> dict:
+        result = {
+            "id": model,
+            "object": "model",
+            "created": 0,
+            "owned_by": "user"
+        }
+        return result
+
+    def model_list(self) -> dict:
+        if not self.models:
+            return {}
+
+        result = {
+            "object": "list",
+            "data": [ self.model_info(model) for model in list(set(self.models.keys() | self.models.values())) if model ]
+        }
+
+        return result
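Once `register_model()` has been called, the stub answers `/v1/models` in the same shape as the OpenAI API. A quick check against a locally running server (default port 5006 assumed) might look like this; note that `model_list()` merges registered aliases and their target model ids, so both appear in the listing.

```python
# Sketch: list the models the stub reports; assumes the server is running on localhost:5006.
import requests

models = requests.get("http://localhost:5006/v1/models").json()
for m in models["data"]:
    print(m["id"], m["owned_by"])
# With the defaults this prints both 'gpt-4-vision-preview' and the underlying
# model id (e.g. 'vikhyatk/moondream2'), each reported as owned_by 'user'.
```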
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b73db69
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+python-datauri
+requests
+uvicorn
+fastapi
+
+# moondream
+timm
+einops
+transformers>=4.39.0
\ No newline at end of file
diff --git a/test_vision.py b/test_vision.py
new file mode 100755
index 0000000..747a50b
--- /dev/null
+++ b/test_vision.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+import argparse
+from datauri import DataURI
+from openai import OpenAI
+
+# Initialize argparse
+parser = argparse.ArgumentParser(description='Test vision using OpenAI')
+parser.add_argument('image_url', type=str, help='URL or image file to be tested')
+parser.add_argument('question', type=str, nargs='?', default='Describe the image', help='The question to ask the image')
+args = parser.parse_args()
+
+client = OpenAI(base_url='http://localhost:5006/v1', api_key='skip')
+
+image_url = args.image_url
+question = args.question
+
+if not image_url.startswith('http'):
+    image_url = str(DataURI.from_file(image_url))
+
+response = client.chat.completions.create(
+    model="gpt-4-vision-preview",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": question},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                },
+            ],
+        }
+    ],
+    max_tokens=300,
+)
+
+print(response.choices[0].message.content)
\ No newline at end of file
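`test_vision.py` leans on `python-datauri` to turn a local image file into a `data:` URI before sending it. A roughly equivalent stdlib-only helper (illustrative only, not part of the repo) would be:

```python
# Illustrative stdlib-only equivalent of DataURI.from_file(); not part of this repo.
import base64
import mimetypes

def file_to_data_uri(path: str) -> str:
    mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:{mime};base64,{encoded}"
```

The server can consume such URLs because `VisionQnABase.url_to_image()` (below) decodes `data:` URIs with `DataURI(img_url).data`.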
diff --git a/vision.py b/vision.py
new file mode 100644
index 0000000..3955c5f
--- /dev/null
+++ b/vision.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+import sys
+import time
+import argparse
+import importlib
+
+from typing import Optional, List, Literal
+import uvicorn
+from pydantic import BaseModel
+
+import openedai
+
+
+app = openedai.OpenAIStub()
+
+class ImageURL(BaseModel):
+    url: str
+    detail: Optional[str] = "auto" # auto -> low (512) or high (Nx512) based on res.
+
+class Content(BaseModel):
+    type: Literal["text", "image_url"]
+    text: Optional[str] = None
+    image_url: Optional[ImageURL] = None
+
+class Message(BaseModel):
+    role: str
+    content: List[Content]
+
+class ImageChatRequest(BaseModel):
+    model: str # = "gpt-4-vision-preview"
+    messages: List[Message]
+    max_tokens: int = 300
+
+@app.post(path="/v1/chat/completions")
+async def chat_with_images(request: ImageChatRequest):
+
+    # XXX only single image & prompt for now
+    for c in request.messages[0].content:
+        if c.image_url:
+            image_url = c.image_url.url
+        elif c.text:
+            prompt = c.text
+
+    text = await vision_qna.single_question(image_url, prompt)
+
+    t_id = int(time.time() * 1e9)
+
+    vis_chat_resp = {
+        "id": f"chatcmpl-{t_id}",
+        "object": "chat.completion",
+        "created": t_id,
+        "model": vision_qna.model_name,
+        "system_fingerprint": "fp_111111111",
+        "choices": [{
+            "index": 0,
+            "message": {
+                "role": "assistant",
+                "content": text,
+            },
+            "logprobs": None,
+            "finish_reason": "stop"
+        }],
+        "usage": {
+            "prompt_tokens": 0,
+            "completion_tokens": 0,
+            "total_tokens": 0
+        }
+    }
+
+    return vis_chat_resp
+
+def parse_args(argv=None):
+    parser = argparse.ArgumentParser(
+        description='OpenedAI Vision API Server',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('-m', '--model', action='store', default="vikhyatk/moondream2", help="The model to use, Ex. deepseek-ai/deepseek-vl-7b-chat")
+    parser.add_argument('-b', '--backend', action='store', default="moondream", help="The backend to use (moondream, deepseek)")
+    parser.add_argument('-d', '--device', action='store', default="auto", help="Set the torch device for the model. Ex. cuda:1")
+    parser.add_argument('-P', '--port', action='store', default=5006, type=int, help="Server tcp port")
+    parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
+    parser.add_argument('--preload', action='store_true', help="Preload model and exit.")
+    return parser.parse_args(argv)
+
+if __name__ == "__main__":
+    args = parse_args(sys.argv[1:])
+
+    print(f"Loading VisionQnA[{args.backend}] with {args.model}")
+    backend = importlib.import_module(f'backend.{args.backend}')
+    vision_qna = backend.VisionQnA(args.model, args.device)
+
+    if args.preload:
+        sys.exit(0)
+
+    app.register_model('gpt-4-vision-preview', args.model)
+
+    uvicorn.run(app, host=args.host, port=args.port)
diff --git a/vision_qna.py b/vision_qna.py
new file mode 100644
index 0000000..ceeb1a9
--- /dev/null
+++ b/vision_qna.py
@@ -0,0 +1,24 @@
+
+import io
+import requests
+from datauri import DataURI
+from PIL import Image
+
+class VisionQnABase:
+    model_name: str = None
+
+    def __init__(self, model_id: str, device: str):
+        pass
+
+    async def url_to_image(self, img_url: str) -> Image.Image:
+        if img_url.startswith('http'):
+            response = requests.get(img_url)
+
+            img_data = response.content
+        elif img_url.startswith('data:'):
+            img_data = DataURI(img_url).data
+
+        return Image.open(io.BytesIO(img_data)).convert("RGB")
+
+    async def single_question(self, image_url: str, prompt: str) -> str:
+        pass
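`url_to_image()` is the only image plumbing a backend needs: it fetches `http(s)` URLs with `requests`, decodes `data:` URIs, and returns an RGB `PIL.Image`. A quick standalone check, assuming the requirements are installed and the script is run from the repo root, could look like:

```python
# Quick standalone check of VisionQnABase.url_to_image(); the URL is just the README sample image.
import asyncio

from vision_qna import VisionQnABase

async def main():
    qna = VisionQnABase(model_id="", device="cpu")  # the base class loads no model
    img = await qna.url_to_image(
        "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/"
        "Gfp-wisconsin-madison-the-nature-boardwalk.jpg/"
        "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    )
    print(img.mode, img.size)  # RGB plus the pixel dimensions

asyncio.run(main())
```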