diff --git a/README.md b/README.md
index ac5d607..bfe653e 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-
+
Tool to benchmark LLM Inference Systems
diff --git a/docs/_static/logo/dark.png b/docs/_static/logo/dark.png
index 7c458b2..aa3756f 100644
Binary files a/docs/_static/logo/dark.png and b/docs/_static/logo/dark.png differ
diff --git a/docs/_static/logo/light.png b/docs/_static/logo/light.png
index 6cf8076..4330800 100644
Binary files a/docs/_static/logo/light.png and b/docs/_static/logo/light.png differ
diff --git a/etalon/capacity_search/config/config.py b/etalon/capacity_search/config/config.py
index f770249..6b3c620 100644
--- a/etalon/capacity_search/config/config.py
+++ b/etalon/capacity_search/config/config.py
@@ -33,6 +33,7 @@ def to_config_dict(self):
class ModelConfig:
name: str
identifier: str
+ tokenizer: str = None
parallel_specs: List[str] = field(default_factory=list)
traces: List[str] = field(default_factory=list)
@@ -40,13 +41,16 @@ def get_key(self):
return f"{self.name}"
def get_human_readable_name(self):
- return f"Model: {self.name}"
+ return f"Model: {self.name}, Tokenizer: {self.tokenizer}"
def to_config_dict(self):
- return {"model_name": self.identifier}
+ return {"model_name": self.identifier, "tokenizer_name": self.tokenizer}
def to_args(self):
- return f"--model {self.identifier}"
+ command = f"--model {self.identifier}"
+ if self.tokenizer:
+ command += f" --tokenizer {self.tokenizer}"
+ return command
def is_parallel_spec_valid(self, spec_name: str) -> bool:
return not self.parallel_specs or spec_name in self.parallel_specs
@@ -312,10 +316,6 @@ def generate_job_configs(cls, config: dict):
and server_config.openai_server_engine
in ["vllm", "lightllm", "fastertransformers", "sarathi-serve"]
)
- or (
- model_config.name != "gpt-3.5-turbo"
- and server_config.openai_server_engine == "default"
- )
or (
request_generator_config.trace_file_name == "sharegpt"
and request_config.request_generator_max_tokens == 16384
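Note: ModelConfig now carries an optional tokenizer that, when set, is forwarded as a --tokenizer flag. A self-contained sketch of the updated behavior (trimmed to the fields shown above; the identifiers are illustrative):

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ModelConfig:
    name: str
    identifier: str
    tokenizer: Optional[str] = None
    parallel_specs: List[str] = field(default_factory=list)
    traces: List[str] = field(default_factory=list)

    def to_args(self) -> str:
        # --tokenizer is emitted only when explicitly configured;
        # otherwise run_benchmark.py falls back to the model name.
        command = f"--model {self.identifier}"
        if self.tokenizer:
            command += f" --tokenizer {self.tokenizer}"
        return command

print(ModelConfig("llama-70b", "my-org/served-llama", tokenizer="meta-llama/Llama-2-70b-hf").to_args())
# --model my-org/served-llama --tokenizer meta-llama/Llama-2-70b-hf
```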
diff --git a/etalon/core/hf_utils.py b/etalon/core/hf_utils.py
index 087f85e..e913a21 100644
--- a/etalon/core/hf_utils.py
+++ b/etalon/core/hf_utils.py
@@ -26,7 +26,7 @@ def get_tokenizer(
)
except TypeError as e:
# The LLaMA tokenizer causes a protobuf error in some environments.
- err_msg = "Failed to load the tokenizer."
+ err_msg = "Failed to load the tokenizer. If model name is correct, consider setting --tokenizer CLI arg to equivalent model on HuggingFace."
raise RuntimeError(err_msg) from e
except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
diff --git a/etalon/core/llm_clients/__init__.py b/etalon/core/llm_clients/__init__.py
index 65c3bfd..7a67c33 100644
--- a/etalon/core/llm_clients/__init__.py
+++ b/etalon/core/llm_clients/__init__.py
@@ -10,7 +10,11 @@
def construct_clients(
- model_name: str, llm_api: str, num_clients: int, use_ray: bool = True
+ model_name: str,
+ tokenizer_name: str,
+ llm_api: str,
+ num_clients: int,
+ use_ray: bool = True,
) -> List[BaseLLMClient]:
"""Construct LLMClients that will be used to make requests to the LLM API.
@@ -36,8 +40,8 @@ def construct_clients(
)
if use_ray:
- clients = [impl.remote(model_name) for _ in range(num_clients)]
+ clients = [impl.remote(model_name, tokenizer_name) for _ in range(num_clients)]
else:
- clients = [impl(model_name) for _ in range(num_clients)]
+ clients = [impl(model_name, tokenizer_name) for _ in range(num_clients)]
return clients
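A hedged usage sketch of the updated factory; the llm_api value and the model/tokenizer names below are illustrative rather than repository defaults:

```python
from etalon.core.llm_clients import construct_clients

# model_name is what the serving endpoint expects; tokenizer_name is the
# HuggingFace tokenizer loaded locally for token counting.
clients = construct_clients(
    model_name="my-served-model",
    tokenizer_name="my-org/equivalent-hf-model",
    llm_api="openai",   # assumed value; use whatever your deployment is configured for
    num_clients=4,
    use_ray=True,       # False returns plain clients instead of Ray actors
)
```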
diff --git a/etalon/core/llm_clients/base_llm_client.py b/etalon/core/llm_clients/base_llm_client.py
index 1420226..f63ca8b 100644
--- a/etalon/core/llm_clients/base_llm_client.py
+++ b/etalon/core/llm_clients/base_llm_client.py
@@ -9,9 +9,10 @@
class BaseLLMClient:
"""A client for making requests to a LLM API e.g Anyscale Endpoints."""
- def __init__(self, model_name: str) -> None:
+ def __init__(self, model_name: str, tokenizer_name: str) -> None:
+ self.model_name = model_name
self.tokenizer = get_tokenizer(
- model_name,
+ tokenizer_name,
trust_remote_code=True,
)
diff --git a/etalon/core/llm_clients/openai_chat_completions_client.py b/etalon/core/llm_clients/openai_chat_completions_client.py
index 15a0a8a..71dbe2d 100644
--- a/etalon/core/llm_clients/openai_chat_completions_client.py
+++ b/etalon/core/llm_clients/openai_chat_completions_client.py
@@ -19,8 +19,8 @@
class OpenAIChatCompletionsClient(BaseLLMClient):
"""Client for OpenAI Chat Completions API."""
- def __init__(self, model_name: str) -> None:
- super().__init__(model_name)
+ def __init__(self, model_name: str, tokenizer_name: str) -> None:
+ super().__init__(model_name, tokenizer_name)
self.client = httpx.AsyncClient()
def total_tokens(self, response_list: List[str]) -> int:
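Any other client subclass must accept the same two-argument constructor; a minimal template (the class name is hypothetical):

```python
from etalon.core.llm_clients.base_llm_client import BaseLLMClient

class MyCustomClient(BaseLLMClient):
    """Illustrative skeleton for a new client after this change."""

    def __init__(self, model_name: str, tokenizer_name: str) -> None:
        # model_name identifies the served model; tokenizer_name selects the
        # HuggingFace tokenizer that BaseLLMClient loads for token counting.
        super().__init__(model_name, tokenizer_name)
```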
diff --git a/etalon/core/requests_launcher.py b/etalon/core/requests_launcher.py
index c349b11..c9b6e6a 100644
--- a/etalon/core/requests_launcher.py
+++ b/etalon/core/requests_launcher.py
@@ -13,6 +13,7 @@ class RequestsLauncher:
def __init__(
self,
model: str,
+ tokenizer_name: str,
llm_api: str,
num_ray_clients: int,
num_concurrent_requests_per_client: int,
@@ -23,6 +24,7 @@ def __init__(
AsyncRequestsManager.remote(
client_id=client_id,
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
max_concurrent_requests=num_concurrent_requests_per_client,
)
diff --git a/etalon/core/requests_manager.py b/etalon/core/requests_manager.py
index 68a1825..2e8890c 100644
--- a/etalon/core/requests_manager.py
+++ b/etalon/core/requests_manager.py
@@ -15,7 +15,12 @@ class AsyncRequestsManager:
"""Manages requests for single LLM API client."""
def __init__(
- self, client_id: int, model: str, llm_api: str, max_concurrent_requests: int
+ self,
+ client_id: int,
+ model: str,
+ tokenizer_name: str,
+ llm_api: str,
+ max_concurrent_requests: int,
):
self.max_concurrent_requests = max_concurrent_requests
self.requests_queue = asyncio.Queue(maxsize=max_concurrent_requests)
@@ -23,6 +28,7 @@ def __init__(
# just create a single client per manager
self.llm_client = construct_clients(
model_name=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_clients=1,
use_ray=False,
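Taken together, these changes thread the tokenizer name through the whole request path (see the run_benchmark.py changes below); roughly, as a sketch of the flow rather than verbatim code:

```python
# run_benchmark(model, tokenizer_name, ...)
#   -> run_main_loop(model, tokenizer_name, ...)
#     -> RequestsLauncher(model=..., tokenizer_name=..., llm_api=..., ...)
#       -> AsyncRequestsManager.remote(client_id=..., model=...,
#              tokenizer_name=..., llm_api=..., max_concurrent_requests=...)
#         -> construct_clients(model_name=..., tokenizer_name=...,
#                llm_api=..., num_clients=1, use_ray=False)
#           -> BaseLLMClient.__init__(model_name, tokenizer_name)
#             -> get_tokenizer(tokenizer_name, trust_remote_code=True)
```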
diff --git a/etalon/prefill_profiler.py b/etalon/prefill_profiler.py
index aaac198..fc6aa47 100644
--- a/etalon/prefill_profiler.py
+++ b/etalon/prefill_profiler.py
@@ -81,6 +81,7 @@ def run(self):
os.makedirs(run_dir, exist_ok=True)
run_benchmark(
model=self.args.model,
+ tokenizer_name=self.args.tokenizer,
output_dir=run_dir,
additional_sampling_params=self.args.additional_sampling_params,
num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
diff --git a/etalon/run_benchmark.py b/etalon/run_benchmark.py
index bee9a93..0f7aab5 100644
--- a/etalon/run_benchmark.py
+++ b/etalon/run_benchmark.py
@@ -107,6 +107,7 @@ async def collect_results(
async def run_main_loop(
model: str,
+ tokenizer_name: str,
llm_api: str,
tokenizer: Any,
additional_sampling_params: Optional[Dict[str, Any]] = None,
@@ -123,6 +124,7 @@ async def run_main_loop(
):
req_launcher = RequestsLauncher(
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_ray_clients=num_ray_clients,
num_concurrent_requests_per_client=num_concurrent_requests_per_client,
@@ -185,6 +187,7 @@ async def run_main_loop(
def run_benchmark(
model: str,
+ tokenizer_name: str,
output_dir: str,
additional_sampling_params: Optional[Dict[str, Any]] = None,
num_ray_clients: int = 2,
@@ -239,7 +242,7 @@ def run_benchmark(
)
tokenizer = get_tokenizer(
- model,
+ tokenizer_name=tokenizer_name,
trust_remote_code=True,
)
@@ -265,6 +268,7 @@ def run_benchmark(
asyncio.run(
run_main_loop(
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
tokenizer=tokenizer,
additional_sampling_params=additional_sampling_params,
@@ -300,6 +304,12 @@ def parse_args():
args.add_argument(
"--model", type=str, required=True, help="The model to use for this load test."
)
+ args.add_argument(
+ "--tokenizer",
+ type=str,
+ required=False,
+ help="The tokenizer to use for this load test. By default, the tokenizer is inferred from the model.",
+ )
args.add_argument(
"--num-ray-clients",
type=int,
@@ -591,6 +601,9 @@ def parse_args():
args = args.parse_args()
+ if args.tokenizer is None:
+ args.tokenizer = args.model
+
if not args.should_use_given_dir:
benchmark_identifier = f"{args.model}_{args.request_interval_generator_provider}_{args.request_length_generator_provider}"
benchmark_identifier = re.sub(r"[^\w\d-]+", "-", benchmark_identifier)
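For callers that invoke the benchmark programmatically (as prefill_profiler.py does above), the new keyword slots in next to model; if it is omitted on the CLI, parse_args() falls back to the model name as shown. A minimal sketch with illustrative values:

```python
from etalon.run_benchmark import run_benchmark

run_benchmark(
    model="my-served-model",                      # name the API endpoint expects
    tokenizer_name="my-org/equivalent-hf-model",  # HuggingFace tokenizer for token counting
    output_dir="benchmark_results/run_0",
    num_ray_clients=2,
)
```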
@@ -629,6 +642,7 @@ def parse_args():
llm_api=args.llm_api,
output_dir=args.output_dir,
model=args.model,
+ tokenizer_name=args.tokenizer,
timeout=args.timeout,
max_num_completed_requests=args.max_num_completed_requests,
num_ray_clients=args.num_ray_clients,