Fixes
njhill authored and joerunde committed Mar 11, 2024
1 parent 1810585 commit ab322bb
Showing 5 changed files with 200 additions and 28 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -98,6 +98,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY --from=build /workspace/vllm/thirdparty_files /workspace/vllm/thirdparty_files
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
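
The runtime image now carries the vendored third-party sources under /workspace/vllm/thirdparty_files next to the compiled *.so extensions. As a rough sketch only (the repository's actual wiring, e.g. a PYTHONPATH setting elsewhere in the Dockerfile, may differ), a vendored-sources directory like this can be exposed to the interpreter by prepending it to sys.path; the path constant is taken from the COPY line above, everything else is an assumption.

# Illustrative sketch only: exposing a vendored-sources directory at runtime.
# The directory path matches the COPY destination above; the sys.path wiring
# itself is an assumption, not code from this commit.
import os
import sys

THIRDPARTY_DIR = "/workspace/vllm/thirdparty_files"

if os.path.isdir(THIRDPARTY_DIR) and THIRDPARTY_DIR not in sys.path:
    # Prepend so the vendored copies take precedence over site-packages.
    sys.path.insert(0, THIRDPARTY_DIR)
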
44 changes: 32 additions & 12 deletions benchmarks/kernels/benchmark_mixtral_moe.py
@@ -12,22 +12,39 @@

def main():
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
run_grid(bs, method=method)


def run_grid(bs, method):
d_model = 4096
num_total_experts = 8
top_k = 2
tp_size = 2
model_intermediate_size = 14336
num_layers = 32
num_calls = 100
best_configs = {}

for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
best_configs.update(
run_grid(bs=bs,
method=method,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
num_layers=num_layers))

device_name = torch.cuda.get_device_name().replace(" ", "_")
filename = f"E={num_total_experts},N={model_intermediate_size//tp_size},device_name={device_name}.json"
print(f"writing combined configs to file {filename}")
with open(filename, 'w') as fd:
json.dump(best_configs, fd, indent=4)


def run_grid(bs: int, method, d_model: int, num_total_experts: int, top_k: int,
tp_size: int, model_intermediate_size: int,
             num_layers: int) -> dict:
num_calls = 100
num_warmup_trials = 1
num_trials = 1

@@ -64,7 +81,7 @@ def run_grid(bs, method):
print(f'{tp_size=} {bs=}')
print(f'{config}')
# warmup
print(f'warming up')
print('warming up')
try:
for _ in range(num_warmup_trials):
run_timing(
@@ -82,7 +99,7 @@ def run_grid(bs, method):
continue

# trial
print(f'benchmarking')
print('benchmarking')
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
@@ -109,11 +126,14 @@ def run_grid(bs, method):

print("best_time_us", best_time_us)
print("best_config", best_config)
bs_best_config = {str(bs): best_config}

filename = "/tmp/config.jsonl"
print(f"writing config to file {filename}")
with open(filename, "a") as f:
f.write(json.dumps({str(bs): best_config}) + "\n")
f.write(json.dumps(bs_best_config) + "\n")

return bs_best_config


def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
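
main() now aggregates the per-batch-size results returned by run_grid() and writes them to a single combined file named E=<experts>,N=<intermediate size / tp>,device_name=<gpu>.json, keyed by batch size, in addition to the per-run /tmp/config.jsonl append. A hypothetical consumer of that combined file might look like the sketch below; load_best_config and its nearest-key fallback are illustrative assumptions, not the loader vLLM itself uses.

# Hypothetical reader for the combined config file written by main() above.
# The file maps a benchmarked batch size (as a string key) to a Triton kernel
# config dict. This helper and its nearest-key fallback are assumptions made
# for illustration only.
import json

def load_best_config(path: str, batch_size: int) -> dict:
    with open(path) as f:
        configs = {int(k): v for k, v in json.load(f).items()}
    # Fall back to the closest benchmarked batch size when there is no exact match.
    key = min(configs, key=lambda k: abs(k - batch_size))
    return configs[key]

# e.g. cfg = load_best_config("E=8,N=7168,device_name=<gpu>.json", 24)
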
36 changes: 20 additions & 16 deletions vllm/entrypoints/grpc/grpc_server.py
@@ -11,7 +11,6 @@
from grpc.aio import ServicerContext
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.transformers_utils.tokenizer import TokenizerGroup
from vllm.logger import init_logger
from vllm.config import ModelConfig
from vllm.entrypoints.grpc.pb import generation_pb2_grpc
@@ -21,6 +20,8 @@
from vllm.entrypoints.openai.serving_completion import merge_async_iterators
from vllm.sampling_params import LogitsProcessor
from vllm.tgis_utils.logits_processors import MinTokensLogitsProcessor, TypicalLogitsWarperWrapper
from vllm.transformers_utils.tokenizer import TokenizerGroup
from vllm.sequence import Logprob
from vllm import AsyncLLMEngine, SamplingParams, RequestOutput, CompletionOutput

logger = init_logger(__name__)
@@ -398,7 +399,7 @@ def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
def _convert_tokens(
self,
token_ids: list[int],
logprobs_list: Optional[list[Dict[int, float]]],
logprobs_list: Optional[list[Dict[int, Logprob]]],
include_logprobs: bool,
top_n_tokens: int,
token_infos: MutableSequence[TokenInfo], # OUT
@@ -414,20 +415,23 @@ def _convert_tokens(
token_info = TokenInfo(text=text)
if logprobs_list is not None:
logprobs = logprobs_list[i]
if include_logprobs:
token_info.logprob = logprobs[token_ids[i]]
if top_n_tokens:
items = sorted(logprobs.items(),
key=lambda item: item[1],
reverse=True)[:top_n_tokens]
#TODO later use get_lora_tokenizer here
tt_texts = self.tokenizer.convert_ids_to_tokens(
[tid for tid, _ in items])
token_info.top_tokens.extend(
TokenInfo.TopToken(
text=tt_text,
logprob=logprob,
) for tt_text, (_, logprob) in zip(tt_texts, items))
# Logprobs entry will be None for first prompt token
if logprobs is not None:
if include_logprobs:
token_info.logprob = logprobs[token_ids[i]].logprob
if top_n_tokens:
items = sorted(logprobs.items(),
key=lambda item: item[1].logprob,
reverse=True)[:top_n_tokens]
#TODO later use get_lora_tokenizer here
tt_texts = self.tokenizer.convert_ids_to_tokens(
[tid for tid, _ in items])
token_info.top_tokens.extend(
TokenInfo.TopToken(
text=tt_text,
logprob=logprob.logprob,
)
for tt_text, (_, logprob) in zip(tt_texts, items))
token_infos.append(token_info)

async def _validate_prompt_and_tokenize(
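
Two things change in the logprobs handling above: each logprobs entry is now a vllm.sequence.Logprob object rather than a bare float (hence the .logprob accesses), and the entry for the first prompt token is None and must be skipped. The self-contained sketch below mirrors that logic; the dataclass is a stand-in for the real Logprob class, of which only a .logprob field is assumed.

# Self-contained sketch of the top-n selection above. The dataclass is a
# stand-in for vllm.sequence.Logprob; only its .logprob field is assumed.
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

@dataclass
class Logprob:
    logprob: float

def top_n_tokens(logprobs: Optional[Dict[int, Logprob]],
                 n: int) -> List[Tuple[int, float]]:
    if logprobs is None:
        # e.g. the first prompt token, which has no logprobs entry
        return []
    items = sorted(logprobs.items(),
                   key=lambda item: item[1].logprob,
                   reverse=True)[:n]
    return [(token_id, lp.logprob) for token_id, lp in items]

print(top_n_tokens({1: Logprob(-0.1), 2: Logprob(-2.3), 3: Logprob(-0.5)}, 2))
# -> [(1, -0.1), (3, -0.5)]
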
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -175,6 +175,7 @@ async def validation_exception_handler(_, exc):
@app.get("/health")
async def health() -> Response:
"""Health check."""
await openai_serving_chat.engine.check_health()
return Response(status_code=200)


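With the added call, /health returns 200 only after the engine's own check_health() completes without raising, so the endpoint now reflects engine health rather than just process liveness. A minimal external probe, assuming the server's default host and port, could look like this:

# Minimal external probe for the /health endpoint. The host and port are
# assumptions (vLLM api_server defaults); adjust to the deployment.
import urllib.request

def is_healthy(base_url: str = "http://localhost:8000") -> bool:
    try:
        with urllib.request.urlopen(f"{base_url}/health", timeout=5) as resp:
            return resp.status == 200
    except Exception:
        return False

if __name__ == "__main__":
    print("healthy" if is_healthy() else "unhealthy")
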
146 changes: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"48": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
}
}
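
The batch-size keys in this new config file line up with the bs values swept in benchmark_mixtral_moe.py above, and each value is one Triton launch configuration (block sizes, group size, warps, stages). A quick, purely illustrative consistency check, with the file path left as a placeholder:

# Illustrative sanity check: the keys of the generated config file should match
# the batch sizes swept by the benchmark. The path below is a placeholder.
import json

EXPECTED_BS = [1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
               2048, 3072, 4096]

with open("E=8,N=7168,device_name=<gpu>.json") as f:
    found = sorted(int(k) for k in json.load(f))

assert found == EXPECTED_BS, f"unexpected batch-size keys: {found}"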
