0.35.0
matatonic committed Sep 29, 2024
1 parent 5d2252d commit ec48edc
Showing 11 changed files with 185 additions and 234 deletions.
11 changes: 7 additions & 4 deletions README.md
@@ -156,10 +156,11 @@ If you can't find your favorite model, you can [open a new issue](https://github

Version 0.35.0

- Update Molmo (tensorflow-cpu no longer required), and add autocast to faster, smaller types than float32.
- Add `completion_tokens` counts for streamed results, other compatibility improvements
- Update Molmo (tensorflow-cpu no longer required), and add autocast for faster, smaller types than float32.
- New option: `--use-double-quant` to enable double quantization with `--load-in-4bit`, a little slower for a little less VRAM (see the sketch after these notes).
- Molmo 72B will now run in under 48GB of vram using `--load-in-4bit --use-double-quant`.
- Add `completion_tokens` counts and logged T/s for streamed results, other compatibility improvements
- Include sample tokens/s data (A100) in `vision.sample.env`
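
Double quantization is the bitsandbytes feature that also quantizes the 4-bit quantization constants, trading a little speed for a little less VRAM. Below is a minimal sketch of the kind of configuration `--use-double-quant` presumably maps to; the exact wiring inside the server is not shown in this diff, so the snippet uses the standard transformers/bitsandbytes parameter names rather than the project's own code:

```
# Sketch only: standard transformers + bitsandbytes 4-bit config with double
# quantization enabled; how vision.py wires --use-double-quant is assumed here.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,    # the extra quantization step behind --use-double-quant
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/Molmo-72B-0924",          # one of the models listed in model_conf_tests.json
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
```

Per the note above, this combination is what brings Molmo 72B under 48GB of VRAM.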

Version 0.34.0

@@ -396,8 +397,9 @@ Additional steps may be required for some models, see the Dockerfile for the lat
## Usage

```
usage: vision.py [-h] -m MODEL [-b BACKEND] [-f FORMAT] [-d DEVICE] [--device-map DEVICE_MAP] [--max-memory MAX_MEMORY] [--no-trust-remote-code] [-4] [-8] [-F] [-A {sdpa,eager,flash_attention_2}] [-T MAX_TILES] [--preload]
[-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-H HOST] [-P PORT]
usage: vision.py [-h] -m MODEL [-b BACKEND] [-f FORMAT] [-d DEVICE] [--device-map DEVICE_MAP] [--max-memory MAX_MEMORY] [--no-trust-remote-code] [-4]
[--use-double-quant] [-8] [-F] [-A {sdpa,eager,flash_attention_2}] [-T MAX_TILES] [--preload] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-H HOST]
[-P PORT]
OpenedAI Vision API Server
@@ -418,6 +420,7 @@ options:
--no-trust-remote-code
Don't trust remote code (required for many models) (default: False)
-4, --load-in-4bit load in 4bit (doesn't work with all models) (default: False)
--use-double-quant Used with --load-in-4bit for an extra memory savings, a bit slower (default: False)
-8, --load-in-8bit load in 8bit (doesn't work with all models) (default: False)
-F, --use-flash-attn DEPRECATED: use --attn_implementation flash_attention_2 or -A flash_attention_2 (default: False)
-A {sdpa,eager,flash_attention_2}, --attn_implementation {sdpa,eager,flash_attention_2}
61 changes: 0 additions & 61 deletions backend/deepseek-vl.py

This file was deleted.

3 changes: 3 additions & 0 deletions backend/florence.py
@@ -57,7 +57,10 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
**params,
)

tps_start = time.time()
generated_ids = self.model.generate(**generation_kwargs)
logger.info(f"Generated {len(generated_ids[0])} tokens at {len(generated_ids[0]) / (time.time() - tps_start):0.2f} T/s")

generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = self.processor.post_process_generation(generated_text, task=select_task(prompt), image_size=(images[0].width, images[0].height))

6 changes: 5 additions & 1 deletion backend/generic.py
@@ -61,8 +61,12 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:

params = self.get_generation_params(request, default_params=default_params)


tps_start = time.time()
output = self.model.generate(**inputs, **params)
response = self.processor.tokenizer.decode(output[0][inputs.input_ids.size(1):].cpu(), skip_special_tokens=True)
out_tokens = output[0][inputs.input_ids.size(1):].cpu()
logger.info(f"Generated {len(out_tokens)} tokens at {len(out_tokens) / (time.time() - tps_start):0.2f} T/s")
response = self.processor.tokenizer.decode(out_tokens, skip_special_tokens=True)

return response

2 changes: 1 addition & 1 deletion backend/molmo.py
@@ -19,7 +19,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p

#self.dtype = self.params['torch_dtype'] = 'auto' # torch.float32

self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False), torch_dtype=self.params['torch_dtype'], device_map=self.params['device_map'])
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

# bitsandbytes already moves the model to the device, so we don't need to do it again.
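
The autocast change mentioned in the release notes ("faster, smaller types than float32") is not visible in this hunk; the usual PyTorch pattern looks roughly like the sketch below, where bfloat16 and a CUDA device are assumptions, and `model`/`inputs` stand in for the backend's loaded model and processed inputs.

```
# Sketch only: generation wrapped in autocast so matmuls run in a smaller,
# faster dtype than float32. The dtype and device are assumptions here.
import torch

with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=256)
```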
2 changes: 2 additions & 0 deletions backend/pixtral.py
@@ -43,6 +43,8 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
temperature= 0.35 if request.temperature is None else request.temperature,
)

tps_start = time.time()
out_tokens, _ = generate([tokenized.tokens], self.model, images=[tokenized.images], **generation_kwargs)
logger.info(f"Generated {len(out_tokens[0])} tokens at {len(out_tokens[0]) / (time.time() - tps_start):0.2f} T/s")

return self.tokenizer.decode(out_tokens[0])
8 changes: 3 additions & 5 deletions model_conf_tests.json
@@ -42,8 +42,6 @@
["Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"],
["Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5"],
["Salesforce/xgen-mm-phi3-mini-instruct-r-v1"],
["THUDM/glm-4v-9b", "--device-map", "cuda:0", "--load-in-4bit"],
["THUDM/glm-4v-9b", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0", "--load-in-4bit"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0"],
["adept/fuyu-8b", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -57,9 +55,9 @@
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2"],
["allenai/Molmo-72B-0924", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-72B-0924", "--load-in-4bit"],
["echo840/Monkey-Chat", "--load-in-4bit"],
["allenai/Molmo-72B-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-72B-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["echo840/Monkey-Chat", "--load-in-4bit"],
["echo840/Monkey-Chat"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2", "--load-in-4bit"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2"],
79 changes: 36 additions & 43 deletions test_models.py
@@ -114,23 +114,31 @@ def test(cmd_args: list[str]) -> int:
t = time.time()

try:
results = single_round()
results, timing = single_round()
except Exception as e:
traceback.print_exc()
note = f'Test failed with Exception: {e}'
print(f"{note}")
results = [False]
timing = []

t = time.time() - t

mem = get_total_gpu_mem_used()

result = all(results)
if not note:
note = f'{results.count(True)}/{len(results)} tests passed.'
note = f'{results.count(True)}/{len(results)} tests passed'
if timing:
tok_total, tim_total = 0, 0.0
for tok, tim in timing:
if tok > 1 and tim > 0:
tok_total += tok
tim_total += tim
if tim_total > 0.0:
note += f', {tok_total/tim_total:0.1f} T/s ({tok_total}/{tim_total:0.1f}s)'

print(f"\n\n###\n\nTest complete.\nResult: {green_pass if result else red_fail}, time: {t:.1f}s")


record_result(cmd_args, results, t, mem, note)

@@ -176,7 +184,6 @@ def test(cmd_args: list[str]) -> int:
params['top_p'] = args.top_p

def generate_response(image_url, prompt):

messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
messages.extend([
{ "role": "user", "content": [
@@ -186,10 +193,10 @@ def generate_response(image_url, prompt):

response = client.chat.completions.create(model="gpt-4-vision-preview", messages=messages, **params)
answer = response.choices[0].message.content
return answer
tok = response.usage.completion_tokens
return answer, tok

def generate_stream_response(image_url, prompt):

messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
messages.extend([
{ "role": "user", "content": [
@@ -199,50 +206,44 @@ def generate_stream_response(image_url, prompt):

response = client.chat.completions.create(model="gpt-4-vision-preview", messages=messages, **params, stream=True)
answer = ''
completion_tokens = 0
for chunk in response:
if chunk.choices[0].delta.content:
answer += chunk.choices[0].delta.content
if chunk.usage:
completion_tokens = chunk.usage.completion_tokens

return answer
return answer, completion_tokens

def single_round():
# XXX TODO: timeout
results = []
### Single round
timing = []

# url tests
for name, url in urls.items():
answer = generate_response(url, "What is the subject of the image?")
def single_test(url, question, label, generator=generate_response):
tps_time = time.time()
answer, tok = generator(url, question)
tps_time = time.time() - tps_time
correct = name in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[url]: fail, got: {answer}")
if args.abort_on_fail:
break
print(f"{name}[{label}]: fail, got: {answer}")
#if args.abort_on_fail:
# break
else:
print(f"{name}[url]: pass{', got: ' + answer if args.verbose else ''}")
print(f"{name}[{label}]: pass{', got: ' + answer if args.verbose else ''}")
if tok > 1:
timing.extend([(tok, tps_time)])

data_url = data_url_from_url(url)
answer = generate_response(data_url, "What is the subject of the image?")
correct = name in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[data]: fail, got: {answer}")
if args.abort_on_fail:
break
else:
print(f"{name}[data]: pass{', got: ' + answer if args.verbose else ''}")

answer = generate_stream_response(data_url, "What is the subject of the image?")
correct = name in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[data_stream]: fail, got: {answer}")
if args.abort_on_fail:
break
else:
print(f"{name}[data_stream]: pass{', got: ' + answer if args.verbose else ''}")
# url tests
for name, url in urls.items():
single_test(url, "What is the subject of the image?", "url", generate_response)

data_url = data_url_from_url(url)
single_test(data_url, "What is the subject of the image?", "data", generate_response)
single_test(data_url, "What is the subject of the image?", "data_stream", generate_stream_response)

"""
## OCR tests
@@ -252,15 +253,7 @@ def single_round():
}
for name, question in quality_urls.items():
prompt, data_url = question
answer = generate_stream_response(data_url, prompt)
correct = name in answer.lower() or 'wal-mart' in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[quality]: fail, got: {answer}")
if args.abort_on_fail:
break
else:
print(f"{name}[quality]: pass{', got: ' + answer if args.verbose else ''}")
single_test(data_url, prompt, "quality", generate_stream_response)
"""

# No image tests
@@ -287,7 +280,7 @@ def no_image_response(prompt):
else:
print(f"{name}[no_img]: pass{', got: ' + answer if args.verbose else ''}")

return results
return results, timing

with open('model_conf_tests.json') as f:
model_tests = json.load(f)
5 changes: 4 additions & 1 deletion vision.py
@@ -55,12 +55,13 @@ async def streamer():
yield {"data": json.dumps(chat_streaming_chunk(''))}
logger.debug(f"sse_chunk: ['']")

tps_start = time.time()
completion_tokens = 0
prompt_tokens = 0 # XXX ignored.
skip_first_space = True
dat = ''
async for resp in vision_qna.stream_chat_with_images(request):
completion_tokens += 1 # XXX wrong if fake streaming
completion_tokens += 1
if skip_first_space:
skip_first_space = False
if resp[:1] == ' ':
@@ -85,6 +86,8 @@
}
}

logger.info(f"Generated {completion_tokens} tokens at {completion_tokens / (time.time() - tps_start):0.2f} T/s")

yield {"data": json.dumps(chunk)}
logger.debug(f"sse_chunk: {[dat]} + ['DONE']")
