[Docs, Bugfix] Update Logo and Tokenizer Bug Fix #11

Merged 6 commits on Aug 30, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -1,9 +1,9 @@
- <!-- <p align="center">
+ <p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="docs/_static/logo/dark.png">
<img alt="vLLM" src="docs/_static/logo/light.png" width=50%>
</picture>
- </p> -->
+ </p>

<h3 align="center">
Tool to benchmark LLM Inference Systems
Binary file modified docs/_static/logo/dark.png
Binary file modified docs/_static/logo/light.png
14 changes: 7 additions & 7 deletions etalon/capacity_search/config/config.py
@@ -33,20 +33,24 @@ def to_config_dict(self):
class ModelConfig:
name: str
identifier: str
+ tokenizer: str = None
parallel_specs: List[str] = field(default_factory=list)
traces: List[str] = field(default_factory=list)

def get_key(self):
return f"{self.name}"

def get_human_readable_name(self):
- return f"Model: {self.name}"
+ return f"Model: {self.name}, Tokenizer: {self.tokenizer}"

def to_config_dict(self):
- return {"model_name": self.identifier}
+ return {"model_name": self.identifier, "tokenizer_name": self.tokenizer}

def to_args(self):
- return f"--model {self.identifier}"
+ command = f"--model {self.identifier}"
+ if self.tokenizer:
+ command += f" --tokenizer {self.tokenizer}"
+ return command

def is_parallel_spec_valid(self, spec_name: str) -> bool:
return not self.parallel_specs or spec_name in self.parallel_specs
@@ -312,10 +316,6 @@ def generate_job_configs(cls, config: dict):
and server_config.openai_server_engine
in ["vllm", "lightllm", "fastertransformers", "sarathi-serve"]
)
- or (
- model_config.name != "gpt-3.5-turbo"
- and server_config.openai_server_engine == "default"
- )
or (
request_generator_config.trace_file_name == "sharegpt"
and request_config.request_generator_max_tokens == 16384
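For reference, a hedged sketch of what the updated ModelConfig now produces (this assumes ModelConfig is a plain dataclass, as its field(default_factory=...) defaults suggest; the model and tokenizer identifiers below are illustrative, not taken from the repo):

```python
from etalon.capacity_search.config.config import ModelConfig

# Without an explicit tokenizer, to_args() emits only the model flag.
cfg = ModelConfig(name="llama-2-70b", identifier="meta-llama/Llama-2-70b-hf")
print(cfg.to_args())         # --model meta-llama/Llama-2-70b-hf
print(cfg.to_config_dict())  # {'model_name': 'meta-llama/Llama-2-70b-hf', 'tokenizer_name': None}

# With a tokenizer override, an extra --tokenizer flag is appended.
cfg = ModelConfig(
    name="llama-2-70b",
    identifier="meta-llama/Llama-2-70b-hf",
    tokenizer="hf-internal-testing/llama-tokenizer",
)
print(cfg.to_args())         # --model meta-llama/Llama-2-70b-hf --tokenizer hf-internal-testing/llama-tokenizer
```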
2 changes: 1 addition & 1 deletion etalon/core/hf_utils.py
@@ -26,7 +26,7 @@ def get_tokenizer(
)
except TypeError as e:
# The LLaMA tokenizer causes a protobuf error in some environments.
- err_msg = "Failed to load the tokenizer."
+ err_msg = "Failed to load the tokenizer. If model name is correct, consider setting --tokenizer CLI arg to equivalent model on HuggingFace."
raise RuntimeError(err_msg) from e
except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
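For context, a minimal sketch of the kind of loader this message lives in, assuming Hugging Face transformers' AutoTokenizer underneath (the real get_tokenizer in etalon/core/hf_utils.py takes more options and handles more error cases):

```python
from transformers import AutoTokenizer

def get_tokenizer(tokenizer_name: str, trust_remote_code: bool = False, **kwargs):
    """Load a tokenizer by name, pointing users at --tokenizer when loading fails."""
    try:
        return AutoTokenizer.from_pretrained(
            tokenizer_name, trust_remote_code=trust_remote_code, **kwargs
        )
    except TypeError as e:
        # The LLaMA tokenizer causes a protobuf error in some environments.
        err_msg = (
            "Failed to load the tokenizer. If model name is correct, consider "
            "setting --tokenizer CLI arg to equivalent model on HuggingFace."
        )
        raise RuntimeError(err_msg) from e
```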
10 changes: 7 additions & 3 deletions etalon/core/llm_clients/__init__.py
@@ -10,7 +10,11 @@


def construct_clients(
- model_name: str, llm_api: str, num_clients: int, use_ray: bool = True
+ model_name: str,
+ tokenizer_name: str,
+ llm_api: str,
+ num_clients: int,
+ use_ray: bool = True,
) -> List[BaseLLMClient]:
"""Construct LLMClients that will be used to make requests to the LLM API.

@@ -36,8 +40,8 @@ def construct_clients(
)

if use_ray:
- clients = [impl.remote(model_name) for _ in range(num_clients)]
+ clients = [impl.remote(model_name, tokenizer_name) for _ in range(num_clients)]
else:
- clients = [impl(model_name) for _ in range(num_clients)]
+ clients = [impl(model_name, tokenizer_name) for _ in range(num_clients)]

return clients
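A hedged usage sketch of the updated construct_clients signature (the llm_api value and the model/tokenizer identifiers are illustrative; use_ray=True wraps each client in a Ray actor, so a Ray runtime is assumed):

```python
import ray
from etalon.core.llm_clients import construct_clients

ray.init(ignore_reinit_error=True)

clients = construct_clients(
    model_name="meta-llama/Llama-2-70b-hf",
    tokenizer_name="hf-internal-testing/llama-tokenizer",  # may now differ from the served model name
    llm_api="openai",  # illustrative API name
    num_clients=2,
    use_ray=True,
)
```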
5 changes: 3 additions & 2 deletions etalon/core/llm_clients/base_llm_client.py
@@ -9,9 +9,10 @@
class BaseLLMClient:
"""A client for making requests to a LLM API e.g Anyscale Endpoints."""

- def __init__(self, model_name: str) -> None:
+ def __init__(self, model_name: str, tokenizer_name: str) -> None:
self.model_name = model_name
self.tokenizer = get_tokenizer(
- model_name,
+ tokenizer_name,
trust_remote_code=True,
)

4 changes: 2 additions & 2 deletions etalon/core/llm_clients/openai_chat_completions_client.py
@@ -19,8 +19,8 @@
class OpenAIChatCompletionsClient(BaseLLMClient):
"""Client for OpenAI Chat Completions API."""

- def __init__(self, model_name: str) -> None:
- super().__init__(model_name)
+ def __init__(self, model_name: str, tokenizer_name: str) -> None:
+ super().__init__(model_name, tokenizer_name)
self.client = httpx.AsyncClient()

def total_tokens(self, response_list: List[str]) -> int:
2 changes: 2 additions & 0 deletions etalon/core/requests_launcher.py
@@ -13,6 +13,7 @@ class RequestsLauncher:
def __init__(
self,
model: str,
+ tokenizer_name: str,
llm_api: str,
num_ray_clients: int,
num_concurrent_requests_per_client: int,
@@ -23,6 +24,7 @@ def __init__(
AsyncRequestsManager.remote(
client_id=client_id,
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
max_concurrent_requests=num_concurrent_requests_per_client,
)
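A hedged construction sketch showing where the new tokenizer_name parameter lands (values are illustrative; the real call site is in run_benchmark.py below, and the launcher spawns Ray actors, so a Ray runtime is assumed):

```python
from etalon.core.requests_launcher import RequestsLauncher

req_launcher = RequestsLauncher(
    model="meta-llama/Llama-2-70b-hf",
    tokenizer_name="hf-internal-testing/llama-tokenizer",
    llm_api="openai",  # illustrative API name
    num_ray_clients=2,
    num_concurrent_requests_per_client=5,
)
```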
8 changes: 7 additions & 1 deletion etalon/core/requests_manager.py
@@ -15,14 +15,20 @@ class AsyncRequestsManager:
"""Manages requests for single LLM API client."""

def __init__(
- self, client_id: int, model: str, llm_api: str, max_concurrent_requests: int
+ self,
+ client_id: int,
+ model: str,
+ tokenizer_name: str,
+ llm_api: str,
+ max_concurrent_requests: int,
):
self.max_concurrent_requests = max_concurrent_requests
self.requests_queue = asyncio.Queue(maxsize=max_concurrent_requests)
self.results = []
# just create a single client per manager
self.llm_client = construct_clients(
model_name=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_clients=1,
use_ray=False,
1 change: 1 addition & 0 deletions etalon/prefill_profiler.py
@@ -81,6 +81,7 @@ def run(self):
os.makedirs(run_dir, exist_ok=True)
run_benchmark(
model=self.args.model,
+ tokenizer_name=self.args.tokenizer,
output_dir=run_dir,
additional_sampling_params=self.args.additional_sampling_params,
num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
16 changes: 15 additions & 1 deletion etalon/run_benchmark.py
@@ -107,6 +107,7 @@ async def collect_results(

async def run_main_loop(
model: str,
+ tokenizer_name: str,
llm_api: str,
tokenizer: Any,
additional_sampling_params: Optional[Dict[str, Any]] = None,
@@ -123,6 +124,7 @@
):
req_launcher = RequestsLauncher(
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_ray_clients=num_ray_clients,
num_concurrent_requests_per_client=num_concurrent_requests_per_client,
@@ -185,6 +187,7 @@ async def run_main_loop(

def run_benchmark(
model: str,
+ tokenizer_name: str,
output_dir: str,
additional_sampling_params: Optional[Dict[str, Any]] = None,
num_ray_clients: int = 2,
@@ -239,7 +242,7 @@ def run_benchmark(
)

tokenizer = get_tokenizer(
- model,
+ tokenizer_name=tokenizer_name,
trust_remote_code=True,
)

@@ -265,6 +268,7 @@ def run_benchmark(
asyncio.run(
run_main_loop(
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
tokenizer=tokenizer,
additional_sampling_params=additional_sampling_params,
@@ -300,6 +304,12 @@ def parse_args():
args.add_argument(
"--model", type=str, required=True, help="The model to use for this load test."
)
+ args.add_argument(
+ "--tokenizer",
+ type=str,
+ required=False,
+ help="The tokenizer to use for this load test. By default, the tokenizer is inferred from the model.",
+ )
args.add_argument(
"--num-ray-clients",
type=int,
@@ -591,6 +601,9 @@ def parse_args():

args = args.parse_args()

+ if args.tokenizer is None:
+ args.tokenizer = args.model

if not args.should_use_given_dir:
benchmark_identifier = f"{args.model}_{args.request_interval_generator_provider}_{args.request_length_generator_provider}"
benchmark_identifier = re.sub(r"[^\w\d-]+", "-", benchmark_identifier)
@@ -629,6 +642,7 @@ def parse_args():
llm_api=args.llm_api,
output_dir=args.output_dir,
model=args.model,
+ tokenizer_name=args.tokenizer,
timeout=args.timeout,
max_num_completed_requests=args.max_num_completed_requests,
num_ray_clients=args.num_ray_clients,
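Taken together, the benchmark can now tokenize with a model other than the one being served. On the CLI this means passing --tokenizer alongside --model; omitting --tokenizer falls back to the model name, as the parse_args change above shows. A hedged sketch of the programmatic entry point (assumes the run_benchmark parameters not shown in this diff keep their defaults; identifiers are illustrative):

```python
from etalon.run_benchmark import run_benchmark

# Served model name differs from the public checkpoint that hosts its tokenizer.
run_benchmark(
    model="my-org/llama-2-70b-awq",              # illustrative served-model identifier
    tokenizer_name="meta-llama/Llama-2-70b-hf",  # illustrative HF tokenizer source
    output_dir="benchmark_results/llama-2-70b-awq",
)
```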