Commit 5d2252d: 0.35.0 pre
matatonic committed Sep 29, 2024
1 parent 6c45b53 commit 5d2252d
Showing 10 changed files with 175 additions and 67 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -7,10 +7,11 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install --upgrade pip
WORKDIR /app
RUN git clone https://github.com/TIGER-AI-Lab/Mantis.git --single-branch /app/Mantis
RUN git clone https://github.com/togethercomputer/Dragonfly --single-branch /app/Dragonfly
RUN git clone https://github.com/baaivision/Emu3 --single-branch /app/emu3

COPY requirements.txt .
ARG VERSION=latest
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "transformers>=4.45.0" >> requirements.txt ; fi
RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.41.2" >> requirements.txt; else echo "transformers>=4.45.1" >> requirements.txt ; fi
RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt

WORKDIR /app/Mantis
9 changes: 8 additions & 1 deletion README.md
@@ -154,6 +154,13 @@ If you can't find your favorite model, you can [open a new issue](https://github

## Recent updates

Version 0.35.0

- Update Molmo (tensorflow-cpu no longer required) and add autocast for faster, smaller dtypes than float32.
- Add `completion_tokens` counts for streamed results, plus other compatibility improvements.
- New option: `--use-double-quant` to enable double quantization with `--load-in-4bit`, a little slower for a little less VRAM (see the sketch below).
- Molmo 72B will now run in under 48GB of VRAM using `--load-in-4bit --use-double-quant`.
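
Roughly speaking, `--use-double-quant` turns on bitsandbytes double quantization on top of 4-bit loading. A minimal sketch of the equivalent `transformers` quantization config follows; the exact wiring inside the server and the compute dtype shown here are assumptions:

```python
# Hedged sketch: approximately what --load-in-4bit --use-double-quant is expected
# to configure via bitsandbytes; the compute dtype is an assumption.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,   # also quantize the quantization constants, saving a bit more VRAM
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/Molmo-72B-0924",
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
```

The corresponding test entry in `model_conf_tests.json` is `["allenai/Molmo-72B-0924", "--load-in-4bit", "--use-double-quant"]`.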

Version 0.34.0

- new model support: Meta-llama: Llama-3.2-11B-Vision-Instruct, Llama-3.2-90B-Vision-Instruct
@@ -377,7 +384,7 @@ docker compose -f docker-compose.alt.yml pull
python -m venv .venv
source .venv/bin/activate
# install the python dependencies
pip install -U -r requirements.txt "transformers>=4.45.0"
pip install -U -r requirements.txt "transformers>=4.45.1"
# OR install the python dependencies for the alt version
pip install -U -r requirements.txt "transformers==4.41.2"
# run the server with your chosen model
74 changes: 74 additions & 0 deletions backend/emu3.py
@@ -0,0 +1,74 @@
from transformers import AutoProcessor, AutoTokenizer, AutoModel, AutoImageProcessor, AutoModelForCausalLM
from transformers.generation.configuration_utils import GenerationConfig

from vision_qna import *

# WIP

# BAAI/Emu3-Gen
# BAAI/Emu3-Chat

VQ_HUB = "BAAI/Emu3-VisionTokenizer"

class VisionQnA(VisionQnABase):
model_name: str = "emu3"
format: str = "emu3"
visual_layers: List[str] = []

def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
super().__init__(model_id, device, device_map, extra_params, format)


self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.model = AutoModel.from_pretrained(**self.params).eval()

# bitsandbytes already moves the model to the device, so we don't need to do it again.
if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
self.model = self.model.to(self.device)

self.loaded_banner()

async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
images, prompt = await prompt_from_messages(request.messages, self.format)

inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.model.device)

default_params = {
'do_sample': False,
# 'eos_token_id': self.processor.tokenizer.eos_token_id,
# 'pad_token_id': self.processor.tokenizer.eos_token_id,
}

params = self.get_generation_params(request, default_params=default_params)

generation_kwargs = dict(
**inputs,
**params,
)

for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
end = new_text.find(self.processor.tokenizer.eos_token)
if end == -1:
yield new_text
else:
yield new_text[:end]
break

async def chat_with_images(self, request: ImageChatRequest) -> str:
images, prompt = await prompt_from_messages(request.messages, self.format)

inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.model.device)

default_params = {
'do_sample': False,
# 'eos_token_id': self.processor.tokenizer.eos_token_id,
# 'pad_token_id': self.processor.tokenizer.eos_token_id,
}

params = self.get_generation_params(request, default_params=default_params)

output = self.model.generate(**inputs, **params)
response = self.processor.tokenizer.decode(output[0][inputs.input_ids.size(1):].cpu(), skip_special_tokens=True)

return response

47 changes: 26 additions & 21 deletions backend/generic.py
@@ -1,4 +1,4 @@
from transformers import AutoTokenizer, AutoModel
from transformers import AutoProcessor, AutoModel

from vision_qna import *

@@ -13,38 +13,24 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p
if not format:
self.format = guess_model_format(model_id)

self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.model = AutoModel.from_pretrained(**self.params).eval()

# bitsandbytes already moves the model to the device, so we don't need to do it again.
if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
self.model = self.model.to(self.device)

self.loaded_banner()

async def chat_with_images(self, request: ImageChatRequest) -> str:
images, prompt = await prompt_from_messages(request.messages, self.format)

inputs = self.tokenizer(prompt, images=images, return_tensors="pt").to(self.model.device)

default_params = {
'do_sample': False,
}

params = self.get_generation_params(request, default_params=default_params)

output = self.model.generate(**inputs, **params)
response = self.tokenizer.decode(output[0][inputs.input_ids.size(1):].cpu(), skip_special_tokens=True)

return response

async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGenerator[str, None]:
images, prompt = await prompt_from_messages(request.messages, self.format)

inputs = self.tokenizer(prompt, images=images, return_tensors="pt").to(self.model.device)
inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.model.device)

default_params = {
'do_sample': False,
# 'eos_token_id': self.processor.tokenizer.eos_token_id,
# 'pad_token_id': self.processor.tokenizer.eos_token_id,
}

params = self.get_generation_params(request, default_params=default_params)
@@ -54,10 +40,29 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
**params,
)

for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.tokenizer, generation_kwargs=generation_kwargs):
end = new_text.find(self.tokenizer.eos_token)
for new_text in threaded_streaming_generator(generate=self.model.generate, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
end = new_text.find(self.processor.tokenizer.eos_token)
if end == -1:
yield new_text
else:
yield new_text[:end]
break

async def chat_with_images(self, request: ImageChatRequest) -> str:
images, prompt = await prompt_from_messages(request.messages, self.format)

inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(self.model.device)

default_params = {
'do_sample': False,
# 'eos_token_id': self.processor.tokenizer.eos_token_id,
# 'pad_token_id': self.processor.tokenizer.eos_token_id,
}

params = self.get_generation_params(request, default_params=default_params)

output = self.model.generate(**inputs, **params)
response = self.processor.tokenizer.decode(output[0][inputs.input_ids.size(1):].cpu(), skip_special_tokens=True)

return response

23 changes: 8 additions & 15 deletions backend/molmo.py
@@ -6,19 +6,8 @@
# allenai/Molmo-7B-D-0924
# allenai/Molmo-7B-O-0924
# allenai/Molmo-72B-0924

# XXX To use, pip install tensorflow-cpu
# https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/image_preprocessing_molmo.py#L88-L90

"""
["allenai/MolmoE-1B-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/MolmoE-1B-0924", "-A", "flash_attention_2"],
["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2"],
["allenai/Molmo-72B-0924", "--load-in-4bit"],
"""
# cyan2k/molmo-7B-D-bnb-4bit XXX needs tensorflow-cpu
# cyan2k/molmo-7B-O-bnb-4bit XXX needs tensorflow-cpu

class VisionQnA(VisionQnABase):
model_name: str = "molmo"
@@ -28,7 +17,7 @@ class VisionQnA(VisionQnABase):
def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
super().__init__(model_id, device, device_map, extra_params, format)

self.dtype = self.params['torch_dtype'] = 'auto' # torch.float32
#self.dtype = self.params['torch_dtype'] = 'auto' # torch.float32

self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()
@@ -68,7 +57,11 @@ async def stream_chat_with_images(self, request: ImageChatRequest) -> AsyncGener
generation_config=GenerationConfig(**params)
)

for new_text in threaded_streaming_generator(generate=self.model.generate_from_batch, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
def wrapper(**kwargs):
with torch.amp.autocast('cuda', dtype=self.dtype):
_ = self.model.generate_from_batch(**kwargs)

for new_text in threaded_streaming_generator(generate=wrapper, tokenizer=self.processor.tokenizer, generation_kwargs=generation_kwargs):
end = new_text.find(self.processor.tokenizer.eos_token)
if end == -1:
yield new_text
24 changes: 12 additions & 12 deletions model_conf_tests.json
@@ -1,8 +1,6 @@
[
["AIDC-AI/Ovis1.5-Gemma2-9B", "-A", "flash_attention_2"],
["AIDC-AI/Ovis1.5-Llama3-8B", "-A", "flash_attention_2"],
["BAAI/Bunny-Llama-3-8B-V", "--load-in-4bit"],
["BAAI/Bunny-Llama-3-8B-V"],
["BAAI/Bunny-v1_0-2B-zh", "--load-in-4bit"],
["BAAI/Bunny-v1_0-2B-zh"],
["BAAI/Bunny-v1_0-3B", "--load-in-4bit"],
@@ -12,8 +10,6 @@
["BAAI/Bunny-v1_0-4B"],
["BAAI/Bunny-v1_1-4B", "--load-in-4bit"],
["BAAI/Bunny-v1_1-4B"],
["BAAI/Bunny-v1_1-Llama-3-8B-V", "--load-in-4bit"],
["BAAI/Bunny-v1_1-Llama-3-8B-V"],
["BAAI/Emu2-Chat", "--load-in-4bit"],
["BAAI/Emu2-Chat", "--max-memory=0:78GiB,1:20GiB"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -50,13 +46,20 @@
["THUDM/glm-4v-9b", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0", "--load-in-4bit"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-clip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["TIGER-Lab/Mantis-8B-clip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-siglip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["TIGER-Lab/Mantis-8B-siglip-llama3", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["adept/fuyu-8b", "--device-map", "cuda:0", "--load-in-4bit"],
["adept/fuyu-8b", "--device-map", "cuda:0"],
["echo840/Monkey-Chat", "--load-in-4bit"],
["allenai/MolmoE-1B-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/MolmoE-1B-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/MolmoE-1B-0924", "-A", "flash_attention_2"],
["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/Molmo-7B-D-0924", "-A", "flash_attention_2"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2"],
["allenai/Molmo-72B-0924", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-72B-0924", "--load-in-4bit"],
["echo840/Monkey-Chat", "--load-in-4bit"],
["echo840/Monkey-Chat"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2", "--load-in-4bit"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2"],
@@ -96,14 +99,11 @@
["mistralai/Pixtral-12B-2409"],
["mx262/MiniMonkey", "-A", "flash_attention_2", "--load-in-4bit"],
["mx262/MiniMonkey", "-A", "flash_attention_2"],
["omlab/omchat-v2.0-13B-single-beta_hf", "-A", "flash_attention_2"],
["openbmb/MiniCPM-V-2_6-int4", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["openbmb/MiniCPM-V-2_6", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["openbmb/MiniCPM-Llama3-V-2_5", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["openbmb/MiniCPM-Llama3-V-2_5", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["qihoo360/360VL-8B", "-A", "flash_attention_2", "--load-in-4bit"],
["qihoo360/360VL-8B", "-A", "flash_attention_2"],
["qnguyen3/nanoLLaVA", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
["qnguyen3/nanoLLaVA", "-A", "flash_attention_2", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA-1.5", "-A", "flash_attention_2", "--device-map", "cuda:0", "--load-in-4bit"],
8 changes: 2 additions & 6 deletions requirements.txt
@@ -54,12 +54,8 @@ logger
git+https://github.com/LLaVA-VL/LLaVA-NeXT.git

# mistral
mistral_inference>=1.4.0
mistral_common[opencv]>=1.4.3
mistral_inference
mistral_common[opencv]

# got-ocr2
verovio

# molmo... 1GB of dependencies? wait till it's removed.
# https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/image_preprocessing_molmo.py#L88-L90
#tensorflow-cpu
4 changes: 3 additions & 1 deletion test_models.py
@@ -315,7 +315,9 @@ def no_image_response(prompt):
HF_HOME=hf_home
HF_HUB_ENABLE_HF_TRANSFER=1
#HF_TOKEN=hf-...
#CUDA_VISIBLE_DEVICES=1,0""", file=results_file)
#CUDA_VISIBLE_DEVICES=1,0
#OPENEDAI_DEVICE_MAP="sequential"
""", file=results_file)

for r in all_results:
cmdl = ' '.join(r['args'])
18 changes: 14 additions & 4 deletions vision.py
@@ -41,9 +41,11 @@ def chat_streaming_chunk(content):
"object": "chat.completions.chunk",
"created": t_id,
"model": vision_qna.model_name,
#"system_fingerprint": "sk-ip",
"choices": [{
"index": 0,
"finish_reason": None,
#"logprobs": None,
"delta": {'role': 'assistant', 'content': content},
}],
}
@@ -53,10 +55,12 @@ async def streamer():
yield {"data": json.dumps(chat_streaming_chunk(''))}
logger.debug(f"sse_chunk: ['']")

# TODO: count tokens
completion_tokens = 0
prompt_tokens = 0 # XXX ignored.
skip_first_space = True
dat = ''
async for resp in vision_qna.stream_chat_with_images(request):
completion_tokens += 1 # XXX wrong if fake streaming
if skip_first_space:
skip_first_space = False
if resp[:1] == ' ':
@@ -73,9 +77,12 @@
chunk = chat_streaming_chunk(dat)
chunk['choices'][0]['finish_reason'] = "stop" # XXX
chunk['usage'] = {
"prompt_tokens": 1, # XXX
"completion_tokens": 1, # XXX
"total_tokens": 1, # XXX
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": completion_tokens + prompt_tokens,
"completion_tokens_details": {
"reasoning_tokens": 0
}
}

yield {"data": json.dumps(chunk)}
@@ -126,6 +133,7 @@ def parse_args(argv=None):
parser.add_argument('--max-memory', action='store', default=None, help="(emu2 only) Set the per cuda device_map max_memory. Ex. 0:22GiB,1:22GiB,cpu:128GiB")
parser.add_argument('--no-trust-remote-code', action='store_true', help="Don't trust remote code (required for many models)")
parser.add_argument('-4', '--load-in-4bit', action='store_true', help="load in 4bit (doesn't work with all models)")
parser.add_argument('--use-double-quant', action='store_true', help="Used with --load-in-4bit for an extra memory savings, a bit slower")
parser.add_argument('-8', '--load-in-8bit', action='store_true', help="load in 8bit (doesn't work with all models)")
parser.add_argument('-F', '--use-flash-attn', action='store_true', help="DEPRECATED: use --attn_implementation flash_attention_2 or -A flash_attention_2")
parser.add_argument('-A', '--attn_implementation', default='sdpa', type=str, help="Set the attn_implementation", choices=['sdpa', 'eager', 'flash_attention_2'])
@@ -156,6 +164,8 @@

if args.load_in_4bit:
extra_params['load_in_4bit'] = True
if args.use_double_quant:
extra_params['4bit_use_double_quant'] = True
if args.load_in_8bit:
extra_params['load_in_8bit'] = True
if args.max_tiles: