0.4.0
matatonic committed Apr 4, 2024
1 parent ead3989 commit a733035
Showing 15 changed files with 362 additions and 130 deletions.
118 changes: 117 additions & 1 deletion .github/workflows/build-docker.yml
@@ -65,4 +65,120 @@ jobs:
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
labels: version=${{ github.run_id }}

build-and-push-cogvlm-image:
runs-on: ubuntu-latest

permissions:
contents: read
packages: write

env:
# Set up environment variables for the job
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: matatonic/cogvlm
TAG: ${{ github.sha }}

steps:
- name: Check out code
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true

# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.cogvlm
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}

# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.cogvlm
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}

build-and-push-yi-vl-image:
runs-on: ubuntu-latest

permissions:
contents: read
packages: write

env:
# Set up environment variables for the job
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: matatonic/yi-vl
TAG: ${{ github.sha }}

steps:
- name: Check out code
uses: actions/checkout@v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true

# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.yi-vl
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}

# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.yi-vl
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
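
Once these jobs run, the per-backend images are published to GHCR alongside the main image. A quick sketch of pulling them, assuming the `latest` tag produced by the main-branch build:

```shell
# pull the per-backend images published by the workflow jobs above
docker pull ghcr.io/matatonic/cogvlm:latest
docker pull ghcr.io/matatonic/yi-vl:latest
```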
10 changes: 7 additions & 3 deletions Dockerfile
@@ -2,10 +2,14 @@ FROM python:3.11-slim

RUN mkdir -p /app
WORKDIR /app
COPY requirements.txt .

RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
RUN pip install -r requirements.txt
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY requirements.*.txt .
RUN for r in requirements.*.txt ; do pip install --no-cache-dir -r $r; done

COPY *.py .
COPY backend /app/backend
CMD python vision.py
CMD python vision.py
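
With this layout the image can be built and run directly; a minimal sketch, assuming a GPU-enabled Docker setup and the default port 5006 from `vision.py` (`openedai-vision` is just a placeholder tag name):

```shell
# build the main image and run it with GPU access on the default port
docker build -t openedai-vision .
docker run --gpus all -p 5006:5006 openedai-vision
```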
10 changes: 10 additions & 0 deletions Dockerfile.cogvlm
@@ -0,0 +1,10 @@
FROM python:3.11-slim

ADD https://github.com/THUDM/CogVLM/raw/main/openai_demo/openai_api.py /usr/src/
WORKDIR /usr/src

# reduced dependencies for smaller image size
RUN pip install --no-cache-dir transformers>=4.36.2 torch>=2.1.0 torchvision>=0.16.2 pydantic>=2.6.0 fastapi>=0.109.0 uvicorn>=0.27.0 loguru~=0.7.2 sse-starlette>=1.8.2 \
xformers>=0.0.22 accelerate>=0.26.1 pillow>=10.2.0 timm>=0.9.12 einops sentencepiece protobuf bitsandbytes

CMD python openai_api.py
10 changes: 10 additions & 0 deletions Dockerfile.yi-vl
@@ -0,0 +1,10 @@
FROM python:3.11-slim

RUN apt-get update && apt-get install -y git

RUN git clone https://github.com/01-ai/Yi /app
WORKDIR /app
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir loguru openai sse-starlette tiktoken

CMD python VL/openai_api.py
45 changes: 32 additions & 13 deletions README.md
@@ -8,21 +8,35 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- Not affiliated with OpenAI in any way

Backend Model support:
- [X] Moondream2 [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [ ] Moondream1 [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1) *(broken for me)
- [X] LlavaNext [llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf (llava-v1.6-34b-hf is not working well yet)](https://huggingface.co/llava-hf) *(only supports a single image)
- [X] Llava [llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf](https://huggingface.co/llava-hf) *(only supports a single image)
- [X] [LlavaNext](https://huggingface.co/llava-hf) - (llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf - llava-v1.6-34b-hf is not working well yet) *(only supports a single image)
- [X] [Llava](https://huggingface.co/llava-hf) - (llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf) *(only supports a single image)
- [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)
- [X] Moondream2 - [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [ ] Moondream1 - [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1)
- [ ] Deepseek-VL - [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
- [ ] [openbmb/OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
- [ ] [echo840/Monkey](https://huggingface.co/echo840/Monkey)
- [ ] ...

Version: 0.3.0

Some vision systems include their own OpenAI-compatible API server. Pre-built images and docker-compose files for them are also included (see the example below):
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) ([cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf), [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)), `docker-compose.cogvlm.yml` **Recommended for 16GB-40GB GPUs**
- [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL ([Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B), [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)), `docker-compose.yi-vl.yml`
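
A minimal sketch of launching one of these bundled servers from its compose file, assuming docker compose v2 and a GPU-enabled Docker setup:

```shell
# start the CogVLM OpenAI-compatible server
docker compose -f docker-compose.cogvlm.yml up -d

# or the Yi-VL variant
docker compose -f docker-compose.yi-vl.yml up -d
```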

Version: 0.4.0

Recent updates:
- llava (1.5) / llavanext (1.6+) backends
- Yi-VL and CogVLM (docker containers only)
- new backend: Qwen-VL
- new backend: llava (1.5)
- new backend: llavanext (1.6+)
- multi-turn questions & answers
- chat_with_images.py test tool
- chat_with_images.py test tool and code sample
- selectable chat formats (phi15, vicuna, chatml, llama2/mistral)
- flash attention 2, accelerate, bitsandbytes (4bit, 8bit) support
- flash attention 2, accelerate (device split), bitsandbytes (4bit, 8bit) support


See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)


API Documentation
@@ -36,6 +36,10 @@ Installation instructions
```shell
# install the python dependencies
pip install -r requirements.txt
# Install backend specific requirements (or select only backends you plan to use)
pip install -r requirements.moondream.txt -r requirements.qwen-vl.txt
# install the package
pip install .
# run the server
python vision.py
```
@@ -53,16 +53,16 @@ options:
-m MODEL, --model MODEL
The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf (default: vikhyatk/moondream2)
-b BACKEND, --backend BACKEND
The backend to use (moondream1, moondream2, llavanext, llava) (default: moondream2)
The backend to use (moondream1, moondream2, llavanext, llava, qwen-vl) (default: moondream2)
-f FORMAT, --format FORMAT
Force a specific chat format. (vicuna, mistral, chatml, llama2, phi15) (default: None)
--load-in-4bit load in 4bit (default: False)
--load-in-8bit load in 8bit (default: False)
--use-flash-attn Use Flash Attention 2 (default: False)
--load-in-4bit load in 4bit (doesn't work with all models) (default: False)
--load-in-8bit load in 8bit (doesn't work with all models) (default: False)
--use-flash-attn Use Flash Attention 2 (doesn't work with all models or GPU) (default: False)
-d DEVICE, --device DEVICE
Set the torch device for the model. Ex. cuda:1 (default: auto)
-P PORT, --port PORT Server tcp port (default: 5006)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
-H HOST, --host HOST Host to listen on, Ex. localhost (default: 0.0.0.0)
--preload Preload model and exit. (default: False)
```
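
For example, a typical launch combining the options above (the backend and model names are taken from the supported list; adjust the quantization and flash attention flags for your hardware):

```shell
# serve llava-v1.6-mistral-7b with 4-bit quantization and flash attention on the default port
python vision.py -b llavanext -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn
```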

@@ -89,3 +89,4 @@ Answer: No, there are no animals visible in the picture. The focus is on the pat
Question:
```
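
Since the server speaks the `gpt-4-vision-preview` style API, any OpenAI-compatible client can be pointed at it. A rough sketch with `curl` — the `/v1/chat/completions` path is an assumption based on the OpenAI-compatible interface; the port is the documented default 5006:

```shell
# assumes the server is running locally on the default port 5006
curl http://localhost:5006/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4-vision-preview",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/example.jpg"}}
      ]
    }],
    "max_tokens": 128
  }'
```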

61 changes: 61 additions & 0 deletions backend/deepseek-vl.py
@@ -0,0 +1,61 @@

print("deepseek is a WORK IN PROGRESS and doesn't work yet.")

from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images

# specify the path to the model
# model_path = "deepseek-ai/deepseek-vl-7b-chat"

class VisionQnA(VisionQnABase):
model_name: str = "deepseek-vl"
format: str = ''

def __init__(self, model_id: str, device: str, extra_params = {}, format = None):
super().__init__(model_id, device, extra_params, format)

self.processor = VLChatProcessor.from_pretrained(model_id)
self.model = MultiModalityCausalLM.from_pretrained(**self.params)

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
# XXX WIP
conversation = [
{
"role": "User",
"content": "<image_placeholder>Describe each stage of this image.",
"images": ["./images/training_pipelines.jpg"]
},
{
"role": "Assistant",
"content": ""
}
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
conversations=conversation,
images=pil_images,
force_batchify=True
).to(self.model.device)

# run image encoder to get the image embeddings
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

# run the model to get the response
outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=tokenizer.eos_token_id,
bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
max_new_tokens=512,
do_sample=False,
use_cache=True
)

answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(f"{prepare_inputs['sft_format'][0]}", answer)
