0.35.0
matatonic committed Sep 29, 2024
1 parent 5d2252d commit ec48edc
Showing 11 changed files with 185 additions and 234 deletions.
11 changes: 7 additions & 4 deletions README.md
@@ -156,10 +156,11 @@ If you can't find your favorite model, you can [open a new issue](https://github

Version 0.35.0

- Update Molmo (tensorflow-cpu no longer required), and add autocast to faster, smaller types than float32.
- Add `completion_tokens` counts for streamed results, other compatibility improvements
- Update Molmo (tensorflow-cpu no longer required), and add autocast for faster, smaller types than float32.
- New option: `--use-double-quant` to enable double quantization with `--load-in-4bit`, a little slower for a little less VRAM (see the sketch after these notes).
- Molmo 72B will now run in under 48GB of vram using `--load-in-4bit --use-double-quant`.
- Add `completion_tokens` counts and logged T/s for streamed results, other compatibility improvements
- Include sample tokens/s data (A100) in `vision.sample.env`
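
Double quantization is the bitsandbytes feature that also quantizes the 4-bit quantization constants, trading a little speed for a little less VRAM. Below is a minimal sketch of the kind of configuration `--use-double-quant` presumably maps to; the exact wiring inside the server is not shown in this diff, so the snippet uses the standard transformers/bitsandbytes parameter names rather than the project's own code:

```
# Sketch only: standard transformers + bitsandbytes 4-bit config with double
# quantization enabled; how vision.py wires --use-double-quant is assumed here.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,    # the extra quantization step behind --use-double-quant
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/Molmo-72B-0924",          # one of the models listed in model_conf_tests.json
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
```

Per the note above, this combination is what brings Molmo 72B under 48GB of VRAM.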

Version 0.34.0

@@ -396,8 +397,9 @@ Additional steps may be required for some models, see the Dockerfile for the lat
## Usage

```
usage: vision.py [-h] -m MODEL [-b BACKEND] [-f FORMAT] [-d DEVICE] [--device-map DEVICE_MAP] [--max-memory MAX_MEMORY] [--no-trust-remote-code] [-4] [-8] [-F] [-A {sdpa,eager,flash_attention_2}] [-T MAX_TILES] [--preload]
[-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-H HOST] [-P PORT]
usage: vision.py [-h] -m MODEL [-b BACKEND] [-f FORMAT] [-d DEVICE] [--device-map DEVICE_MAP] [--max-memory MAX_MEMORY] [--no-trust-remote-code] [-4]
[--use-double-quant] [-8] [-F] [-A {sdpa,eager,flash_attention_2}] [-T MAX_TILES] [--preload] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [-H HOST]
[-P PORT]
OpenedAI Vision API Server
@@ -418,6 +420,7 @@ options:
--no-trust-remote-code
Don't trust remote code (required for many models) (default: False)
-4, --load-in-4bit load in 4bit (doesn't work with all models) (default: False)
--use-double-quant Used with --load-in-4bit for an extra memory savings, a bit slower (default: False)
-8, --load-in-8bit load in 8bit (doesn't work with all models) (default: False)
-F, --use-flash-attn DEPRECATED: use --attn_implementation flash_attention_2 or -A flash_attention_2 (default: False)
-A {sdpa,eager,flash_attention_2}, --attn_implementation {sdpa,eager,flash_attention_2}
61 changes: 0 additions & 61 deletions backend/deepseek-vl.py

This file was deleted.

3 changes: 3 additions & 0 deletions backend/florence.py
@@ -57,7 +57,10 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
**params,
)

tps_start = time.time()
generated_ids = self.model.generate(**generation_kwargs)
logger.info(f"Generated {len(generated_ids[0])} tokens at {len(generated_ids[0]) / (time.time() - tps_start):0.2f} T/s")

generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = self.processor.post_process_generation(generated_text, task=select_task(prompt), image_size=(images[0].width, images[0].height))

6 changes: 5 additions & 1 deletion backend/generic.py
@@ -61,8 +61,12 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:

params = self.get_generation_params(request, default_params=default_params)


tps_start = time.time()
output = self.model.generate(**inputs, **params)
response = self.processor.tokenizer.decode(output[0][inputs.input_ids.size(1):].cpu(), skip_special_tokens=True)
out_tokens = output[0][inputs.input_ids.size(1):].cpu()
logger.info(f"Generated {len(out_tokens)} tokens at {len(out_tokens) / (time.time() - tps_start):0.2f} T/s")
response = self.processor.tokenizer.decode(out_tokens, skip_special_tokens=True)

return response

2 changes: 1 addition & 1 deletion backend/molmo.py
@@ -19,7 +19,7 @@ def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_p

#self.dtype = self.params['torch_dtype'] = 'auto' # torch.float32

self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False), torch_dtype=self.params['torch_dtype'], device_map=self.params['device_map'])
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

# bitsandbytes already moves the model to the device, so we don't need to do it again.
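
The autocast change mentioned in the release notes ("faster, smaller types than float32") is not visible in this hunk; the usual PyTorch pattern looks roughly like the sketch below, where bfloat16 and a CUDA device are assumptions, and `model`/`inputs` stand in for the backend's loaded model and processed inputs.

```
# Sketch only: generation wrapped in autocast so matmuls run in a smaller,
# faster dtype than float32. The dtype and device are assumptions here.
import torch

with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=256)
```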
2 changes: 2 additions & 0 deletions backend/pixtral.py
@@ -43,6 +43,8 @@ async def chat_with_images(self, request: ImageChatRequest) -> str:
temperature= 0.35 if request.temperature is None else request.temperature,
)

tps_start = time.time()
out_tokens, _ = generate([tokenized.tokens], self.model, images=[tokenized.images], **generation_kwargs)
logger.info(f"Generated {len(out_tokens[0])} tokens at {len(out_tokens[0]) / (time.time() - tps_start):0.2f} T/s")

return self.tokenizer.decode(out_tokens[0])
8 changes: 3 additions & 5 deletions model_conf_tests.json
@@ -42,8 +42,6 @@
["Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"],
["Salesforce/xgen-mm-phi3-mini-instruct-singleimg-r-v1.5"],
["Salesforce/xgen-mm-phi3-mini-instruct-r-v1"],
["THUDM/glm-4v-9b", "--device-map", "cuda:0", "--load-in-4bit"],
["THUDM/glm-4v-9b", "--device-map", "cuda:0"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0", "--load-in-4bit"],
["TIGER-Lab/Mantis-8B-Fuyu", "--device-map", "cuda:0"],
["adept/fuyu-8b", "--device-map", "cuda:0", "--load-in-4bit"],
@@ -57,9 +55,9 @@
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["allenai/Molmo-7B-O-0924", "-A", "flash_attention_2"],
["allenai/Molmo-72B-0924", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-72B-0924", "--load-in-4bit"],
["echo840/Monkey-Chat", "--load-in-4bit"],
["allenai/Molmo-72B-0924", "-A", "flash_attention_2", "--load-in-4bit", "--use-double-quant"],
["allenai/Molmo-72B-0924", "-A", "flash_attention_2", "--load-in-4bit"],
["echo840/Monkey-Chat", "--load-in-4bit"],
["echo840/Monkey-Chat"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2", "--load-in-4bit"],
["failspy/Phi-3-vision-128k-instruct-abliterated-alpha", "-A", "flash_attention_2"],
79 changes: 36 additions & 43 deletions test_models.py
@@ -114,23 +114,31 @@ def test(cmd_args: list[str]) -> int:
t = time.time()

try:
results = single_round()
results, timing = single_round()
except Exception as e:
traceback.print_exc()
note = f'Test failed with Exception: {e}'
print(f"{note}")
results = [False]
timing = []

t = time.time() - t

mem = get_total_gpu_mem_used()

result = all(results)
if not note:
note = f'{results.count(True)}/{len(results)} tests passed.'
note = f'{results.count(True)}/{len(results)} tests passed'
if timing:
tok_total, tim_total = 0, 0.0
for tok, tim in timing:
if tok > 1 and tim > 0:
tok_total += tok
tim_total += tim
if tim_total > 0.0:
note += f', {tok_total/tim_total:0.1f} T/s ({tok_total}/{tim_total:0.1f}s)'

print(f"\n\n###\n\nTest complete.\nResult: {green_pass if result else red_fail}, time: {t:.1f}s")


record_result(cmd_args, results, t, mem, note)

@@ -176,7 +184,6 @@ def test(cmd_args: list[str]) -> int:
params['top_p'] = args.top_p

def generate_response(image_url, prompt):

messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
messages.extend([
{ "role": "user", "content": [
@@ -186,10 +193,10 @@ def generate_response(image_url, prompt):

response = client.chat.completions.create(model="gpt-4-vision-preview", messages=messages, **params)
answer = response.choices[0].message.content
return answer
tok = response.usage.completion_tokens
return answer, tok

def generate_stream_response(image_url, prompt):

messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
messages.extend([
{ "role": "user", "content": [
@@ -199,50 +206,44 @@ def generate_stream_response(image_url, prompt):

response = client.chat.completions.create(model="gpt-4-vision-preview", messages=messages, **params, stream=True)
answer = ''
completion_tokens = 0
for chunk in response:
if chunk.choices[0].delta.content:
answer += chunk.choices[0].delta.content
if chunk.usage:
completion_tokens = chunk.usage.completion_tokens

return answer
return answer, completion_tokens

def single_round():
# XXX TODO: timeout
results = []
### Single round
timing = []

# url tests
for name, url in urls.items():
answer = generate_response(url, "What is the subject of the image?")
def single_test(url, question, label, generator=generate_response):
tps_time = time.time()
answer, tok = generator(url, question)
tps_time = time.time() - tps_time
correct = name in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[url]: fail, got: {answer}")
if args.abort_on_fail:
break
print(f"{name}[{label}]: fail, got: {answer}")
#if args.abort_on_fail:
# break
else:
print(f"{name}[url]: pass{', got: ' + answer if args.verbose else ''}")
print(f"{name}[{label}]: pass{', got: ' + answer if args.verbose else ''}")
if tok > 1:
timing.extend([(tok, tps_time)])

data_url = data_url_from_url(url)
answer = generate_response(data_url, "What is the subject of the image?")
correct = name in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[data]: fail, got: {answer}")
if args.abort_on_fail:
break
else:
print(f"{name}[data]: pass{', got: ' + answer if args.verbose else ''}")

answer = generate_stream_response(data_url, "What is the subject of the image?")
correct = name in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[data_stream]: fail, got: {answer}")
if args.abort_on_fail:
break
else:
print(f"{name}[data_stream]: pass{', got: ' + answer if args.verbose else ''}")
# url tests
for name, url in urls.items():
single_test(url, "What is the subject of the image?", "url", generate_response)

data_url = data_url_from_url(url)
single_test(data_url, "What is the subject of the image?", "data", generate_response)
single_test(data_url, "What is the subject of the image?", "data_stream", generate_stream_response)

"""
## OCR tests
@@ -252,15 +253,7 @@ def single_round():
}
for name, question in quality_urls.items():
prompt, data_url = question
answer = generate_stream_response(data_url, prompt)
correct = name in answer.lower() or 'wal-mart' in answer.lower()
results.extend([correct])
if not correct:
print(f"{name}[quality]: fail, got: {answer}")
if args.abort_on_fail:
break
else:
print(f"{name}[quality]: pass{', got: ' + answer if args.verbose else ''}")
single_test(data_url, prompt, "quality", generate_stream_response)
"""

# No image tests
@@ -287,7 +280,7 @@ def no_image_response(prompt):
else:
print(f"{name}[no_img]: pass{', got: ' + answer if args.verbose else ''}")

return results
return results, timing

with open('model_conf_tests.json') as f:
model_tests = json.load(f)
5 changes: 4 additions & 1 deletion vision.py
@@ -55,12 +55,13 @@ async def streamer():
yield {"data": json.dumps(chat_streaming_chunk(''))}
logger.debug(f"sse_chunk: ['']")

tps_start = time.time()
completion_tokens = 0
prompt_tokens = 0 # XXX ignored.
skip_first_space = True
dat = ''
async for resp in vision_qna.stream_chat_with_images(request):
completion_tokens += 1 # XXX wrong if fake streaming
completion_tokens += 1
if skip_first_space:
skip_first_space = False
if resp[:1] == ' ':
@@ -85,6 +86,8 @@
}
}

logger.info(f"Generated {completion_tokens} tokens at {completion_tokens / (time.time() - tps_start):0.2f} T/s")

yield {"data": json.dumps(chunk)}
logger.debug(f"sse_chunk: {[dat]} + ['DONE']")
