[Docs, Bugfix] Update Logo and Tokenizer Bug Fix #11

Merged 6 commits on Aug 30, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -1,9 +1,9 @@
- <!-- <p align="center">
+ <p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="docs/_static/logo/dark.png">
<img alt="vLLM" src="docs/_static/logo/light.png" width=50%>
</picture>
- </p> -->
+ </p>

<h3 align="center">
Tool to benchmark LLM Inference Systems
Binary file modified docs/_static/logo/dark.png
Binary file modified docs/_static/logo/light.png
14 changes: 7 additions & 7 deletions etalon/capacity_search/config/config.py
@@ -33,20 +33,24 @@ def to_config_dict(self):
class ModelConfig:
name: str
identifier: str
+ tokenizer: str = None
parallel_specs: List[str] = field(default_factory=list)
traces: List[str] = field(default_factory=list)

def get_key(self):
return f"{self.name}"

def get_human_readable_name(self):
- return f"Model: {self.name}"
+ return f"Model: {self.name}, Tokenizer: {self.tokenizer}"

def to_config_dict(self):
- return {"model_name": self.identifier}
+ return {"model_name": self.identifier, "tokenizer_name": self.tokenizer}

def to_args(self):
- return f"--model {self.identifier}"
+ command = f"--model {self.identifier}"
+ if self.tokenizer:
+ command += f" --tokenizer {self.tokenizer}"
+ return command

def is_parallel_spec_valid(self, spec_name: str) -> bool:
return not self.parallel_specs or spec_name in self.parallel_specs
@@ -312,10 +316,6 @@ def generate_job_configs(cls, config: dict):
and server_config.openai_server_engine
in ["vllm", "lightllm", "fastertransformers", "sarathi-serve"]
)
- or (
- model_config.name != "gpt-3.5-turbo"
- and server_config.openai_server_engine == "default"
- )
or (
request_generator_config.trace_file_name == "sharegpt"
and request_config.request_generator_max_tokens == 16384
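For reference, a hedged sketch of what the updated ModelConfig now produces (this assumes ModelConfig is a plain dataclass, as its field(default_factory=...) defaults suggest; the model and tokenizer identifiers below are illustrative, not taken from the repo):

```python
from etalon.capacity_search.config.config import ModelConfig

# Without an explicit tokenizer, to_args() emits only the model flag.
cfg = ModelConfig(name="llama-2-70b", identifier="meta-llama/Llama-2-70b-hf")
print(cfg.to_args())         # --model meta-llama/Llama-2-70b-hf
print(cfg.to_config_dict())  # {'model_name': 'meta-llama/Llama-2-70b-hf', 'tokenizer_name': None}

# With a tokenizer override, an extra --tokenizer flag is appended.
cfg = ModelConfig(
    name="llama-2-70b",
    identifier="meta-llama/Llama-2-70b-hf",
    tokenizer="hf-internal-testing/llama-tokenizer",
)
print(cfg.to_args())         # --model meta-llama/Llama-2-70b-hf --tokenizer hf-internal-testing/llama-tokenizer
```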
2 changes: 1 addition & 1 deletion etalon/core/hf_utils.py
@@ -26,7 +26,7 @@ def get_tokenizer(
)
except TypeError as e:
# The LLaMA tokenizer causes a protobuf error in some environments.
- err_msg = "Failed to load the tokenizer."
+ err_msg = "Failed to load the tokenizer. If model name is correct, consider setting --tokenizer CLI arg to equivalent model on HuggingFace."
raise RuntimeError(err_msg) from e
except ValueError as e:
# If the error pertains to the tokenizer class not existing or not
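For context, a minimal sketch of the kind of loader this message lives in, assuming Hugging Face transformers' AutoTokenizer underneath (the real get_tokenizer in etalon/core/hf_utils.py takes more options and handles more error cases):

```python
from transformers import AutoTokenizer

def get_tokenizer(tokenizer_name: str, trust_remote_code: bool = False, **kwargs):
    """Load a tokenizer by name, pointing users at --tokenizer when loading fails."""
    try:
        return AutoTokenizer.from_pretrained(
            tokenizer_name, trust_remote_code=trust_remote_code, **kwargs
        )
    except TypeError as e:
        # The LLaMA tokenizer causes a protobuf error in some environments.
        err_msg = (
            "Failed to load the tokenizer. If model name is correct, consider "
            "setting --tokenizer CLI arg to equivalent model on HuggingFace."
        )
        raise RuntimeError(err_msg) from e
```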
10 changes: 7 additions & 3 deletions etalon/core/llm_clients/__init__.py
@@ -10,7 +10,11 @@


def construct_clients(
- model_name: str, llm_api: str, num_clients: int, use_ray: bool = True
+ model_name: str,
+ tokenizer_name: str,
+ llm_api: str,
+ num_clients: int,
+ use_ray: bool = True,
) -> List[BaseLLMClient]:
"""Construct LLMClients that will be used to make requests to the LLM API.

@@ -36,8 +40,8 @@ def construct_clients(
)

if use_ray:
- clients = [impl.remote(model_name) for _ in range(num_clients)]
+ clients = [impl.remote(model_name, tokenizer_name) for _ in range(num_clients)]
else:
- clients = [impl(model_name) for _ in range(num_clients)]
+ clients = [impl(model_name, tokenizer_name) for _ in range(num_clients)]

return clients
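A hedged usage sketch of the updated construct_clients signature (the llm_api value and the model/tokenizer identifiers are illustrative; use_ray=True wraps each client in a Ray actor, so a Ray runtime is assumed):

```python
import ray
from etalon.core.llm_clients import construct_clients

ray.init(ignore_reinit_error=True)

clients = construct_clients(
    model_name="meta-llama/Llama-2-70b-hf",
    tokenizer_name="hf-internal-testing/llama-tokenizer",  # may now differ from the served model name
    llm_api="openai",  # illustrative API name
    num_clients=2,
    use_ray=True,
)
```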
5 changes: 3 additions & 2 deletions etalon/core/llm_clients/base_llm_client.py
@@ -9,9 +9,10 @@
class BaseLLMClient:
"""A client for making requests to a LLM API e.g Anyscale Endpoints."""

- def __init__(self, model_name: str) -> None:
+ def __init__(self, model_name: str, tokenizer_name: str) -> None:
self.model_name = model_name
self.tokenizer = get_tokenizer(
- model_name,
+ tokenizer_name,
trust_remote_code=True,
)

4 changes: 2 additions & 2 deletions etalon/core/llm_clients/openai_chat_completions_client.py
@@ -19,8 +19,8 @@
class OpenAIChatCompletionsClient(BaseLLMClient):
"""Client for OpenAI Chat Completions API."""

- def __init__(self, model_name: str) -> None:
- super().__init__(model_name)
+ def __init__(self, model_name: str, tokenizer_name: str) -> None:
+ super().__init__(model_name, tokenizer_name)
self.client = httpx.AsyncClient()

def total_tokens(self, response_list: List[str]) -> int:
2 changes: 2 additions & 0 deletions etalon/core/requests_launcher.py
@@ -13,6 +13,7 @@ class RequestsLauncher:
def __init__(
self,
model: str,
+ tokenizer_name: str,
llm_api: str,
num_ray_clients: int,
num_concurrent_requests_per_client: int,
@@ -23,6 +24,7 @@ def __init__(
AsyncRequestsManager.remote(
client_id=client_id,
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
max_concurrent_requests=num_concurrent_requests_per_client,
)
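A hedged construction sketch showing where the new tokenizer_name parameter lands (values are illustrative; the real call site is in run_benchmark.py below, and the launcher spawns Ray actors, so a Ray runtime is assumed):

```python
from etalon.core.requests_launcher import RequestsLauncher

req_launcher = RequestsLauncher(
    model="meta-llama/Llama-2-70b-hf",
    tokenizer_name="hf-internal-testing/llama-tokenizer",
    llm_api="openai",  # illustrative API name
    num_ray_clients=2,
    num_concurrent_requests_per_client=5,
)
```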
8 changes: 7 additions & 1 deletion etalon/core/requests_manager.py
@@ -15,14 +15,20 @@ class AsyncRequestsManager:
"""Manages requests for single LLM API client."""

def __init__(
- self, client_id: int, model: str, llm_api: str, max_concurrent_requests: int
+ self,
+ client_id: int,
+ model: str,
+ tokenizer_name: str,
+ llm_api: str,
+ max_concurrent_requests: int,
):
self.max_concurrent_requests = max_concurrent_requests
self.requests_queue = asyncio.Queue(maxsize=max_concurrent_requests)
self.results = []
# just create a single client per manager
self.llm_client = construct_clients(
model_name=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_clients=1,
use_ray=False,
1 change: 1 addition & 0 deletions etalon/prefill_profiler.py
@@ -81,6 +81,7 @@ def run(self):
os.makedirs(run_dir, exist_ok=True)
run_benchmark(
model=self.args.model,
+ tokenizer_name=self.args.tokenizer,
output_dir=run_dir,
additional_sampling_params=self.args.additional_sampling_params,
num_ray_clients=PREFILL_NUM_RAY_CLIENTS,
16 changes: 15 additions & 1 deletion etalon/run_benchmark.py
@@ -107,6 +107,7 @@ async def collect_results(

async def run_main_loop(
model: str,
+ tokenizer_name: str,
llm_api: str,
tokenizer: Any,
additional_sampling_params: Optional[Dict[str, Any]] = None,
@@ -123,6 +124,7 @@
):
req_launcher = RequestsLauncher(
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
num_ray_clients=num_ray_clients,
num_concurrent_requests_per_client=num_concurrent_requests_per_client,
@@ -185,6 +187,7 @@ async def run_main_loop(

def run_benchmark(
model: str,
+ tokenizer_name: str,
output_dir: str,
additional_sampling_params: Optional[Dict[str, Any]] = None,
num_ray_clients: int = 2,
@@ -239,7 +242,7 @@ def run_benchmark(
)

tokenizer = get_tokenizer(
- model,
+ tokenizer_name=tokenizer_name,
trust_remote_code=True,
)

@@ -265,6 +268,7 @@ def run_benchmark(
asyncio.run(
run_main_loop(
model=model,
+ tokenizer_name=tokenizer_name,
llm_api=llm_api,
tokenizer=tokenizer,
additional_sampling_params=additional_sampling_params,
@@ -300,6 +304,12 @@ def parse_args():
args.add_argument(
"--model", type=str, required=True, help="The model to use for this load test."
)
+ args.add_argument(
+ "--tokenizer",
+ type=str,
+ required=False,
+ help="The tokenizer to use for this load test. By default, the tokenizer is inferred from the model.",
+ )
args.add_argument(
"--num-ray-clients",
type=int,
@@ -591,6 +601,9 @@ def parse_args():

args = args.parse_args()

+ if args.tokenizer is None:
+ args.tokenizer = args.model

if not args.should_use_given_dir:
benchmark_identifier = f"{args.model}_{args.request_interval_generator_provider}_{args.request_length_generator_provider}"
benchmark_identifier = re.sub(r"[^\w\d-]+", "-", benchmark_identifier)
@@ -629,6 +642,7 @@ def parse_args():
llm_api=args.llm_api,
output_dir=args.output_dir,
model=args.model,
+ tokenizer_name=args.tokenizer,
timeout=args.timeout,
max_num_completed_requests=args.max_num_completed_requests,
num_ray_clients=args.num_ray_clients,
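Taken together, the benchmark can now tokenize with a model other than the one being served. On the CLI this means passing --tokenizer alongside --model; omitting --tokenizer falls back to the model name, as the parse_args change above shows. A hedged sketch of the programmatic entry point (assumes the run_benchmark parameters not shown in this diff keep their defaults; identifiers are illustrative):

```python
from etalon.run_benchmark import run_benchmark

# Served model name differs from the public checkpoint that hosts its tokenizer.
run_benchmark(
    model="my-org/llama-2-70b-awq",              # illustrative served-model identifier
    tokenizer_name="meta-llama/Llama-2-70b-hf",  # illustrative HF tokenizer source
    output_dir="benchmark_results/llama-2-70b-awq",
)
```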