diff --git a/README.md b/README.md
index ac5d607..bfe653e 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-
+

 Tool to benchmark LLM Inference Systems
diff --git a/docs/_static/logo/dark.png b/docs/_static/logo/dark.png
index 7c458b2..aa3756f 100644
Binary files a/docs/_static/logo/dark.png and b/docs/_static/logo/dark.png differ
diff --git a/docs/_static/logo/light.png b/docs/_static/logo/light.png
index 6cf8076..4330800 100644
Binary files a/docs/_static/logo/light.png and b/docs/_static/logo/light.png differ
diff --git a/etalon/capacity_search/config/config.py b/etalon/capacity_search/config/config.py
index f770249..6b3c620 100644
--- a/etalon/capacity_search/config/config.py
+++ b/etalon/capacity_search/config/config.py
@@ -33,6 +33,7 @@ def to_config_dict(self):
 class ModelConfig:
     name: str
     identifier: str
+    tokenizer: str = None
     parallel_specs: List[str] = field(default_factory=list)
     traces: List[str] = field(default_factory=list)
 
@@ -40,13 +41,16 @@ def get_key(self):
         return f"{self.name}"
 
     def get_human_readable_name(self):
-        return f"Model: {self.name}"
+        return f"Model: {self.name}, Tokenizer: {self.tokenizer}"
 
     def to_config_dict(self):
-        return {"model_name": self.identifier}
+        return {"model_name": self.identifier, "tokenizer_name": self.tokenizer}
 
     def to_args(self):
-        return f"--model {self.identifier}"
+        command = f"--model {self.identifier}"
+        if self.tokenizer:
+            command += f" --tokenizer {self.tokenizer}"
+        return command
 
     def is_parallel_spec_valid(self, spec_name: str) -> bool:
         return not self.parallel_specs or spec_name in self.parallel_specs
@@ -312,10 +316,6 @@ def generate_job_configs(cls, config: dict):
                 and server_config.openai_server_engine
                 in ["vllm", "lightllm", "fastertransformers", "sarathi-serve"]
             )
-            or (
-                model_config.name != "gpt-3.5-turbo"
-                and server_config.openai_server_engine == "default"
-            )
             or (
                 request_generator_config.trace_file_name == "sharegpt"
                 and request_config.request_generator_max_tokens == 16384
diff --git a/etalon/core/hf_utils.py b/etalon/core/hf_utils.py
index 087f85e..e913a21 100644
--- a/etalon/core/hf_utils.py
+++ b/etalon/core/hf_utils.py
@@ -26,7 +26,7 @@ def get_tokenizer(
         )
     except TypeError as e:
         # The LLaMA tokenizer causes a protobuf error in some environments.
-        err_msg = "Failed to load the tokenizer."
+        err_msg = "Failed to load the tokenizer. If model name is correct, consider setting --tokenizer CLI arg to equivalent model on HuggingFace."
         raise RuntimeError(err_msg) from e
     except ValueError as e:
         # If the error pertains to the tokenizer class not existing or not
diff --git a/etalon/core/llm_clients/__init__.py b/etalon/core/llm_clients/__init__.py
index 65c3bfd..7a67c33 100644
--- a/etalon/core/llm_clients/__init__.py
+++ b/etalon/core/llm_clients/__init__.py
@@ -10,7 +10,11 @@
 
 
 def construct_clients(
-    model_name: str, llm_api: str, num_clients: int, use_ray: bool = True
+    model_name: str,
+    tokenizer_name: str,
+    llm_api: str,
+    num_clients: int,
+    use_ray: bool = True,
 ) -> List[BaseLLMClient]:
     """Construct LLMClients that will be used to make requests to the LLM API.
 
@@ -36,8 +40,8 @@ def construct_clients(
     )
 
     if use_ray:
-        clients = [impl.remote(model_name) for _ in range(num_clients)]
+        clients = [impl.remote(model_name, tokenizer_name) for _ in range(num_clients)]
     else:
-        clients = [impl(model_name) for _ in range(num_clients)]
+        clients = [impl(model_name, tokenizer_name) for _ in range(num_clients)]
 
     return clients
diff --git a/etalon/core/llm_clients/base_llm_client.py b/etalon/core/llm_clients/base_llm_client.py
index 1420226..f63ca8b 100644
--- a/etalon/core/llm_clients/base_llm_client.py
+++ b/etalon/core/llm_clients/base_llm_client.py
@@ -9,9 +9,10 @@
 class BaseLLMClient:
     """A client for making requests to a LLM API e.g Anyscale Endpoints."""
 
-    def __init__(self, model_name: str) -> None:
+    def __init__(self, model_name: str, tokenizer_name: str) -> None:
+        self.model_name = model_name
         self.tokenizer = get_tokenizer(
-            model_name,
+            tokenizer_name,
             trust_remote_code=True,
         )
 
diff --git a/etalon/core/llm_clients/openai_chat_completions_client.py b/etalon/core/llm_clients/openai_chat_completions_client.py
index 15a0a8a..71dbe2d 100644
--- a/etalon/core/llm_clients/openai_chat_completions_client.py
+++ b/etalon/core/llm_clients/openai_chat_completions_client.py
@@ -19,8 +19,8 @@
 class OpenAIChatCompletionsClient(BaseLLMClient):
     """Client for OpenAI Chat Completions API."""
 
-    def __init__(self, model_name: str) -> None:
-        super().__init__(model_name)
+    def __init__(self, model_name: str, tokenizer_name: str) -> None:
+        super().__init__(model_name, tokenizer_name)
         self.client = httpx.AsyncClient()
 
     def total_tokens(self, response_list: List[str]) -> int:
diff --git a/etalon/core/requests_launcher.py b/etalon/core/requests_launcher.py
index c349b11..c9b6e6a 100644
--- a/etalon/core/requests_launcher.py
+++ b/etalon/core/requests_launcher.py
@@ -13,6 +13,7 @@ class RequestsLauncher:
     def __init__(
         self,
         model: str,
+        tokenizer_name: str,
         llm_api: str,
         num_ray_clients: int,
         num_concurrent_requests_per_client: int,
@@ -23,6 +24,7 @@ def __init__(
             AsyncRequestsManager.remote(
                 client_id=client_id,
                 model=model,
+                tokenizer_name=tokenizer_name,
                 llm_api=llm_api,
                 max_concurrent_requests=num_concurrent_requests_per_client,
             )
diff --git a/etalon/core/requests_manager.py b/etalon/core/requests_manager.py
index 68a1825..2e8890c 100644
--- a/etalon/core/requests_manager.py
+++ b/etalon/core/requests_manager.py
@@ -15,7 +15,12 @@ class AsyncRequestsManager:
     """Manages requests for single LLM API client."""
 
     def __init__(
-        self, client_id: int, model: str, llm_api: str, max_concurrent_requests: int
+        self,
+        client_id: int,
+        model: str,
+        tokenizer_name: str,
+        llm_api: str,
+        max_concurrent_requests: int,
     ):
         self.max_concurrent_requests = max_concurrent_requests
         self.requests_queue = asyncio.Queue(maxsize=max_concurrent_requests)
@@ -23,6 +28,7 @@ def __init__(
         # just create a single client per manager
         self.llm_client = construct_clients(
             model_name=model,
+            tokenizer_name=tokenizer_name,
             llm_api=llm_api,
             num_clients=1,
             use_ray=False,
diff --git a/etalon/prefill_profiler.py b/etalon/prefill_profiler.py
index aaac198..fc6aa47 100644
--- a/etalon/prefill_profiler.py
+++ b/etalon/prefill_profiler.py
@@ -81,6 +81,7 @@ def run(self):
         os.makedirs(run_dir, exist_ok=True)
         run_benchmark(
             model=self.args.model,
+            tokenizer_name=self.args.tokenizer,
             output_dir=run_dir,
             additional_sampling_params=self.args.additional_sampling_params,
             num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
diff --git a/etalon/run_benchmark.py b/etalon/run_benchmark.py
index bee9a93..0f7aab5 100644
--- a/etalon/run_benchmark.py
+++ b/etalon/run_benchmark.py
@@ -107,6 +107,7 @@ async def collect_results(
 
 async def run_main_loop(
     model: str,
+    tokenizer_name: str,
     llm_api: str,
     tokenizer: Any,
     additional_sampling_params: Optional[Dict[str, Any]] = None,
@@ -123,6 +124,7 @@ async def run_main_loop(
 ):
     req_launcher = RequestsLauncher(
         model=model,
+        tokenizer_name=tokenizer_name,
         llm_api=llm_api,
         num_ray_clients=num_ray_clients,
         num_concurrent_requests_per_client=num_concurrent_requests_per_client,
@@ -185,6 +187,7 @@ async def run_main_loop(
 
 def run_benchmark(
     model: str,
+    tokenizer_name: str,
     output_dir: str,
     additional_sampling_params: Optional[Dict[str, Any]] = None,
     num_ray_clients: int = 2,
@@ -239,7 +242,7 @@ def run_benchmark(
     )
 
     tokenizer = get_tokenizer(
-        model,
+        tokenizer_name=tokenizer_name,
         trust_remote_code=True,
     )
 
@@ -265,6 +268,7 @@ def run_benchmark(
     asyncio.run(
         run_main_loop(
             model=model,
+            tokenizer_name=tokenizer_name,
             llm_api=llm_api,
             tokenizer=tokenizer,
             additional_sampling_params=additional_sampling_params,
@@ -300,6 +304,12 @@ def parse_args():
     args.add_argument(
         "--model", type=str, required=True, help="The model to use for this load test."
     )
+    args.add_argument(
+        "--tokenizer",
+        type=str,
+        required=False,
+        help="The tokenizer to use for this load test. By default, the tokenizer is inferred from the model.",
+    )
     args.add_argument(
         "--num-ray-clients",
         type=int,
@@ -591,6 +601,9 @@ def parse_args():
 
     args = args.parse_args()
 
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+
     if not args.should_use_given_dir:
         benchmark_identifier = f"{args.model}_{args.request_interval_generator_provider}_{args.request_length_generator_provider}"
         benchmark_identifier = re.sub(r"[^\w\d-]+", "-", benchmark_identifier)
@@ -629,6 +642,7 @@ def parse_args():
         llm_api=args.llm_api,
         output_dir=args.output_dir,
         model=args.model,
+        tokenizer_name=args.tokenizer,
         timeout=args.timeout,
         max_num_completed_requests=args.max_num_completed_requests,
         num_ray_clients=args.num_ray_clients,
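
Usage sketch (not part of the patch): with the tokenizer plumbing above, clients can be built against a served model name while tokenization uses a separate HuggingFace tokenizer. The parameter names follow the construct_clients signature in this diff; the model, tokenizer, and llm_api values below are illustrative assumptions, not values taken from the repository.

    # Minimal sketch of the updated construct_clients call (values are illustrative).
    from etalon.core.llm_clients import construct_clients

    clients = construct_clients(
        model_name="my-served-model",               # name exposed by the inference server (assumed)
        tokenizer_name="meta-llama/Llama-2-7b-hf",  # equivalent HuggingFace tokenizer (assumed)
        llm_api="openai",                           # assumed llm_api value
        num_clients=1,
        use_ray=False,                              # use_ray=False builds plain, non-Ray clients
    )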