0.4.0
matatonic committed Apr 4, 2024
1 parent ead3989 commit a733035
Showing 15 changed files with 362 additions and 130 deletions.
118 changes: 117 additions & 1 deletion .github/workflows/build-docker.yml
@@ -65,4 +65,120 @@ jobs:
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
labels: version=${{ github.run_id }}

build-and-push-cogvlm-image:
runs-on: ubuntu-latest

permissions:
contents: read
packages: write

env:
# Set up environment variables for the job
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: matatonic/cogvlm
TAG: ${{ github.sha }}

steps:
- name: Check out code
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true

# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.cogvlm
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}

# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.cogvlm
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}

build-and-push-yi-vl-image:
runs-on: ubuntu-latest

permissions:
contents: read
packages: write

env:
# Set up environment variables for the job
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: matatonic/yi-vl
TAG: ${{ github.sha }}

steps:
- name: Check out code
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true

# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.yi-vl
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}

# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.yi-vl
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
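
Once these jobs run, the per-backend images are published to GHCR alongside the main image. A quick sketch of pulling them, assuming the `latest` tag produced by the main-branch build:

```shell
# pull the per-backend images published by the workflow jobs above
docker pull ghcr.io/matatonic/cogvlm:latest
docker pull ghcr.io/matatonic/yi-vl:latest
```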
10 changes: 7 additions & 3 deletions Dockerfile
@@ -2,10 +2,14 @@ FROM python:3.11-slim

RUN mkdir -p /app
WORKDIR /app
COPY requirements.txt .

RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
RUN pip install -r requirements.txt
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY requirements.*.txt .
RUN for r in requirements.*.txt ; do pip install --no-cache-dir -r $r; done

COPY *.py .
COPY backend /app/backend
CMD python vision.py
CMD python vision.py
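
With this layout the image can be built and run directly; a minimal sketch, assuming a GPU-enabled Docker setup and the default port 5006 from `vision.py` (`openedai-vision` is just a placeholder tag name):

```shell
# build the main image and run it with GPU access on the default port
docker build -t openedai-vision .
docker run --gpus all -p 5006:5006 openedai-vision
```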
10 changes: 10 additions & 0 deletions Dockerfile.cogvlm
@@ -0,0 +1,10 @@
FROM python:3.11-slim

ADD https://github.com/THUDM/CogVLM/raw/main/openai_demo/openai_api.py /usr/src/
WORKDIR /usr/src

# reduced dependencies for smaller image size
RUN pip install --no-cache-dir transformers>=4.36.2 torch>=2.1.0 torchvision>=0.16.2 pydantic>=2.6.0 fastapi>=0.109.0 uvicorn>=0.27.0 loguru~=0.7.2 sse-starlette>=1.8.2 \
xformers>=0.0.22 accelerate>=0.26.1 pillow>=10.2.0 timm>=0.9.12 einops sentencepiece protobuf bitsandbytes

CMD python openai_api.py
10 changes: 10 additions & 0 deletions Dockerfile.yi-vl
@@ -0,0 +1,10 @@
FROM python:3.11-slim

RUN apt-get update && apt-get install -y git

RUN git clone https://github.com/01-ai/Yi /app
WORKDIR /app
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir loguru openai sse-starlette tiktoken

CMD python VL/openai_api.py
45 changes: 32 additions & 13 deletions README.md
@@ -8,21 +8,35 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- Not affiliated with OpenAI in any way

Backend Model support:
- [X] Moondream2 [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [ ] Moondream1 [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1) *(broken for me)
- [X] LlavaNext [llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf (llava-v1.6-34b-hf is not working well yet)](https://huggingface.co/llava-hf) *(only supports a single image)
- [X] Llava [llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf](https://huggingface.co/llava-hf) *(only supports a single image)
- [X] [LlavaNext](https://huggingface.co/llava-hf) - (llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf - llava-v1.6-34b-hf is not working well yet) *(only supports a single image)
- [X] [Llava](https://huggingface.co/llava-hf) - (llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf) *(only supports a single image)
- [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)
- [X] Moondream2 - [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [ ] Moondream1 - [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1)
- [ ] Deepseek-VL - [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
- [ ] [openbmb/OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
- [ ] [echo840/Monkey](https://huggingface.co/echo840/Monkey)
- [ ] ...

Version: 0.3.0

Some vision systems include their own OpenAI-compatible API server. Pre-built images and docker-compose files for them are also included (see the example below):
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) ([cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf), [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)), `docker-compose.cogvlm.yml` **Recommended for 16GB-40GB GPUs**
- [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL ([Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B), [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)), `docker-compose.yi-vl.yml`
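
A minimal sketch of launching one of these bundled servers from its compose file, assuming docker compose v2 and a GPU-enabled Docker setup:

```shell
# start the CogVLM OpenAI-compatible server
docker compose -f docker-compose.cogvlm.yml up -d

# or the Yi-VL variant
docker compose -f docker-compose.yi-vl.yml up -d
```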

Version: 0.4.0

Recent updates:
- llava (1.5) / llavanext (1.6+) backends
- Yi-VL and CogVLM (docker containers only)
- new backend: Qwen-VL
- new backend: llava (1.5)
- new backend: llavanext (1.6+)
- multi-turn questions & answers
- chat_with_images.py test tool
- chat_with_images.py test tool and code sample
- selectable chat formats (phi15, vicuna, chatml, llama2/mistral)
- flash attention 2, accelerate, bitsandbytes (4bit, 8bit) support
- flash attention 2, accelerate (device split), bitsandbytes (4bit, 8bit) support


See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)


API Documentation
@@ -36,6 +36,10 @@ Installation instructions
```shell
# install the python dependencies
pip install -r requirements.txt
# Install backend specific requirements (or select only backends you plan to use)
pip install -r requirements.moondream.txt -r requirements.qwen-vl.txt
# install the package
pip install .
# run the server
python vision.py
```
@@ -53,16 +53,16 @@ options:
-m MODEL, --model MODEL
The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf (default: vikhyatk/moondream2)
-b BACKEND, --backend BACKEND
The backend to use (moondream1, moondream2, llavanext, llava) (default: moondream2)
The backend to use (moondream1, moondream2, llavanext, llava, qwen-vl) (default: moondream2)
-f FORMAT, --format FORMAT
Force a specific chat format. (vicuna, mistral, chatml, llama2, phi15) (default: None)
--load-in-4bit load in 4bit (default: False)
--load-in-8bit load in 8bit (default: False)
--use-flash-attn Use Flash Attention 2 (default: False)
--load-in-4bit load in 4bit (doesn't work with all models) (default: False)
--load-in-8bit load in 8bit (doesn't work with all models) (default: False)
--use-flash-attn Use Flash Attention 2 (doesn't work with all models or GPU) (default: False)
-d DEVICE, --device DEVICE
Set the torch device for the model. Ex. cuda:1 (default: auto)
-P PORT, --port PORT Server tcp port (default: 5006)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
-H HOST, --host HOST Host to listen on, Ex. localhost (default: 0.0.0.0)
--preload Preload model and exit. (default: False)
```
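
For example, a typical launch combining the options above (the backend and model names are taken from the supported list; adjust the quantization and flash attention flags for your hardware):

```shell
# serve llava-v1.6-mistral-7b with 4-bit quantization and flash attention on the default port
python vision.py -b llavanext -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn
```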

@@ -89,3 +89,4 @@ Answer: No, there are no animals visible in the picture. The focus is on the pat
Question:
```
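
Since the server speaks the `gpt-4-vision-preview` style API, any OpenAI-compatible client can be pointed at it. A rough sketch with `curl` — the `/v1/chat/completions` path is an assumption based on the OpenAI-compatible interface; the port is the documented default 5006:

```shell
# assumes the server is running locally on the default port 5006
curl http://localhost:5006/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4-vision-preview",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/example.jpg"}}
      ]
    }],
    "max_tokens": 128
  }'
```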

61 changes: 61 additions & 0 deletions backend/deepseek-vl.py
@@ -0,0 +1,61 @@

print("deepseek is a WORK IN PROGRESS and doesn't work yet.")

from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images

# specify the path to the model
# model_path = "deepseek-ai/deepseek-vl-7b-chat"

class VisionQnA(VisionQnABase):
model_name: str = "deepseek-vl"
format: str = ''

def __init__(self, model_id: str, device: str, extra_params = {}, format = None):
super().__init__(model_id, device, extra_params, format)

self.processor = VLChatProcessor.from_pretrained(model_id)
self.model = MultiModalityCausalLM.from_pretrained(**self.params)

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
# XXX WIP
conversation = [
{
"role": "User",
"content": "<image_placeholder>Describe each stage of this image.",
"images": ["./images/training_pipelines.jpg"]
},
{
"role": "Assistant",
"content": ""
}
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
conversations=conversation,
images=pil_images,
force_batchify=True
).to(self.model.device)

# run image encoder to get the image embeddings
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

# run the model to get the response
outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=tokenizer.eos_token_id,
bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
max_new_tokens=512,
do_sample=False,
use_cache=True
)

answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(f"{prepare_inputs['sft_format'][0]}", answer)
