diff --git a/.github/workflows/workflow_orders_nightly.yml b/.github/workflows/workflow_orders_nightly.yml
index cd6ab1294..dc9f008cf 100644
--- a/.github/workflows/workflow_orders_nightly.yml
+++ b/.github/workflows/workflow_orders_nightly.yml
@@ -20,6 +20,11 @@ jobs:
uses: ./.github/workflows/workflow_finetune.yml
with:
ci_type: nightly
+
+ call-benchmark:
+ uses: ./.github/workflows/workflow_test_benchmark.yml
+ with:
+ ci_type: nightly
# call-finetune-on-intel-gpu:
# uses: ./.github/workflows/workflow_finetune_gpu.yml
diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml
index 69d72a7f8..0ca6f4410 100644
--- a/.github/workflows/workflow_orders_on_merge.yml
+++ b/.github/workflows/workflow_orders_on_merge.yml
@@ -27,3 +27,7 @@ jobs:
Finetune:
needs: Lint
uses: ./.github/workflows/workflow_finetune.yml
+
+ Benchmark:
+ needs: Lint
+ uses: ./.github/workflows/workflow_test_benchmark.yml
diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml
index acec72a6d..9c9ade519 100644
--- a/.github/workflows/workflow_orders_on_pr.yml
+++ b/.github/workflows/workflow_orders_on_pr.yml
@@ -27,3 +27,7 @@ jobs:
Finetune:
needs: Lint
uses: ./.github/workflows/workflow_finetune.yml
+
+ Benchmark:
+ needs: Lint
+ uses: ./.github/workflows/workflow_test_benchmark.yml
diff --git a/.github/workflows/workflow_test_benchmark.yml b/.github/workflows/workflow_test_benchmark.yml
new file mode 100644
index 000000000..2f78c997d
--- /dev/null
+++ b/.github/workflows/workflow_test_benchmark.yml
@@ -0,0 +1,127 @@
+name: Benchmark
+
+on:
+ workflow_call:
+ inputs:
+ ci_type:
+ type: string
+ default: 'pr'
+ runner_container_image:
+ type: string
+ default: '10.1.2.13:5000/llmray-build'
+ http_proxy:
+ type: string
+ default: 'http://10.24.221.169:911'
+ https_proxy:
+ type: string
+ default: 'http://10.24.221.169:911'
+ runner_config_path:
+ type: string
+ default: '/home/ci/llm-ray-actions-runner'
+ code_checkout_path:
+ type: string
+ default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
+ model_cache_path:
+ type: string
+ default: '/mnt/DP_disk1/huggingface/cache'
+
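+# Cancel any in-progress benchmark run for the same workflow and PR/ref when a new run starts.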
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-bench
+ cancel-in-progress: true
+
+jobs:
+ setup-test:
+
+ name: benchmark
+
+ runs-on: self-hosted
+
+ defaults:
+ run:
+ shell: bash
+ container:
+ image: ${{ inputs.runner_container_image }}
+ env:
+ http_proxy: ${{ inputs.http_proxy }}
+ https_proxy: ${{ inputs.https_proxy }}
+ SHELL: bash -eo pipefail
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock
+ - ${{ inputs.runner_config_path }}:/root/actions-runner-config
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+
+ - name: Load environment variables
+ run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
+
+ - name: Determine Target
+ id: "target"
+ run: |
+ target="benchmark"
+ target="${target}_vllm"
+ echo "target is ${target}"
+ echo "target=$target" >> $GITHUB_OUTPUT
+
+ - name: Build Docker Image
+ run: |
+ DF_SUFFIX=".vllm"
+ TARGET=${{steps.target.outputs.target}}
+ docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
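+          # Clean up stopped containers and dangling images left over from previous builds.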
+ docker container prune -f
+ docker image prune -f
+
+ - name: Start Docker Container
+ run: |
+ TARGET=${{steps.target.outputs.target}}
+ cid=$(docker ps -q --filter "name=${TARGET}")
+ if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+ # check and remove exited container
+ cid=$(docker ps -a -q --filter "name=${TARGET}")
+ if [[ ! -z "$cid" ]]; then docker rm $cid; fi
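+          # Mount the shared HF model cache and the checked-out repo, and pass the proxy settings into the benchmark container.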
+ docker run -tid -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -e http_proxy=${{ inputs.http_proxy }} -e https_proxy=${{ inputs.https_proxy }} --name="${TARGET}" --hostname="${TARGET}-container" ${TARGET}:latest
+
+ - name: Start Ray Cluster
+ run: |
+ TARGET=${{steps.target.outputs.target}}
+ docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh"
+
+ - name: Run Benchmark Test
+ run: |
+ TARGET=${{steps.target.outputs.target}}
+ # Additional libraries required for pytest
+ docker exec "${TARGET}" bash -c "pip install -r tests/requirements.txt"
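+          # Patch both model configs with the HF access token so the gated Llama-2 model can be downloaded.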
+ CMD=$(cat << EOF
+ import yaml
+ conf_path = "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
+          with open(conf_path, encoding="utf-8") as reader:
+              result = yaml.load(reader, Loader=yaml.FullLoader)
+              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+          with open(conf_path, 'w') as output:
+              yaml.dump(result, output, sort_keys=False)
+          conf_path = "llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml"
+          with open(conf_path, encoding="utf-8") as reader:
+              result = yaml.load(reader, Loader=yaml.FullLoader)
+              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+          with open(conf_path, 'w') as output:
+              yaml.dump(result, output, sort_keys=False)
+ EOF
+ )
+ docker exec "${TARGET}" python -c "$CMD"
+ docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
+          docker exec "${TARGET}" bash -c "./tests/run-tests-benchmark.sh"
+
+      - name: Stop Ray
+ run: |
+ TARGET=${{steps.target.outputs.target}}
+ cid=$(docker ps -q --filter "name=${TARGET}")
+ if [[ ! -z "$cid" ]]; then
+ docker exec "${TARGET}" bash -c "ray stop"
+ fi
+
+ - name: Stop Container
+ if: success() || failure()
+ run: |
+ TARGET=${{steps.target.outputs.target}}
+ cid=$(docker ps -q --filter "name=${TARGET}")
+ if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
diff --git a/README.md b/README.md
index fe276d955..c0967ab34 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to
* **Interactive Web UI for Enhanced Usability**: Except for command line, LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models.
-![llm-on-ray](https://github.com/intel/llm-on-ray/assets/9278199/68017c14-c0be-4b91-8d71-4b74ab89bd81)
+![llm-on-ray](./docs/assets/solution_technical_overview.png)
## Getting Started
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index f82c934cd..2a2a8570a 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -477,6 +477,7 @@ def main(args: argparse.Namespace):
config["top_p"] = float(args.top_p)
if args.top_k:
config["top_k"] = float(args.top_k)
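+    # Forward the sampling flag; --do_sample is a store_true option, so decoding stays greedy unless it is passed.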
+ config["do_sample"] = args.do_sample
# In order to align with vllm test parameters
if args.vllm_engine:
config["ignore_eos"] = True
@@ -734,6 +735,11 @@ def main(args: argparse.Namespace):
help="The number of highest probability vocabulary tokens to keep \
for top-k-filtering.",
)
+ parser.add_argument(
+ "--do_sample",
+ action="store_true",
+ help="Whether or not to use sampling; use greedy decoding otherwise.",
+ )
parser.add_argument(
"--vllm-engine",
action="store_true",
diff --git a/benchmarks/benchmark_visualize.py b/benchmarks/benchmark_visualize.py
new file mode 100644
index 000000000..1d75f3fc4
--- /dev/null
+++ b/benchmarks/benchmark_visualize.py
@@ -0,0 +1,336 @@
+#
+# Copyright 2023 The LLM-on-Ray Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import matplotlib.pyplot as plt
+import string
+import json
+import os
+
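+# Map human-readable metric names to the field names used in the benchmark summary JSON logs.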
+marks = {}
+marks["prompts_num_mark"] = r"num_prompts"
+marks["total_time_mark"] = r"benchmark_time"
+marks["min_prompt_len_mark"] = r"min_prompt_len"
+marks["med_prompt_len_mark"] = r"med_prompt_len"
+marks["max_prompt_len_mark"] = r"max_prompt_len"
+marks["request_throughput_mark"] = r"throughput_requests_per_s"
+marks["input_token_throughput_mark"] = r"input_throughput_tokens_per_s"
+marks["output_token_throughput_mark"] = r"output_throughput_tokens_per_s"
+marks["latency_per_req_mark"] = r"avg_latency"
+marks["latency_per_token_mark"] = r"avg_per_token_latency"
+marks["latency_first_token_mark"] = r"avg_first_token_latency"
+marks["latency_next_token_mark"] = r"avg_next_token_latency"
+
+
+def get_avg_metric(metric, num_iter, per_iter_len):
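+    # metric is laid out iteration-major; average each of the per_iter_len positions across num_iter runs.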
+ avg_metric = []
+ for i in range(0, per_iter_len):
+ index = i
+ average = 0
+ num = 0
+ while num < num_iter:
+ average += metric[num * per_iter_len + index]
+ num += 1
+ avg_metric.append(average / num_iter)
+ return avg_metric
+
+
+def get_title_label(mark_name):
+ title = marks[mark_name].strip(string.punctuation)
+ label = title
+ if "throughput" in label:
+ label += " (tokens/s)"
+ elif "latency" in label:
+ label += " (s)"
+ return title, label
+
+
+def plot_compare_metric(bs, metric_vllm, metric_llmonray, mark_name, save_path):
+ plt.plot(bs, metric_vllm, color="red", label="VLLM")
+ plt.plot(bs, metric_llmonray, label="without VLLM")
+ plt.xticks(bs)
+
+ plt.xlabel("bs")
+ title, label = get_title_label(mark_name)
+ plt.ylabel(label)
+ plt.legend()
+ plt.title(title)
+
+ plt.savefig(save_path)
+ plt.close()
+ print("generated successfully")
+
+
+def plot_vllm_peak_throughput(bs, output_Token_Throughput, mark_name, save_path):
+ plt.plot(bs, output_Token_Throughput, color="red", label="VLLM")
+ plt.xticks(bs)
+ plt.xlabel("bs")
+ title, label = get_title_label(mark_name)
+ plt.ylabel(label)
+ plt.legend()
+
+ plt.title(title)
+ plt.savefig(save_path)
+ plt.close()
+ print("generated successfully")
+
+
+def plot_latency_throughput(concurrency, num_replica, latency, throughput, save_path):
+ fig, ax1 = plt.subplots()
+
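+    # Latencies are reported in seconds; convert them to milliseconds to match the axis label.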
+ for i in range(len(latency)):
+ latency[i] *= 1000
+ ax1.plot(latency, throughput, color="tab:blue")
+ mark = []
+ for i in concurrency:
+ mark.append(f"bs={int(i/num_replica)}")
+ for i in range(len(mark)):
+ plt.text(latency[i], throughput[i], mark[i])
+ ax1.set_xlabel("Average latency for Next Tokens (ms)")
+ ax1.set_ylabel("Output Tokens Throughput (tokens/sec)", color="tab:blue")
+ ax1.tick_params(axis="y", labelcolor="tab:blue")
+
+ plt.title("Values of Throughput and Next Token Latency corresponding to different bs")
+ plt.savefig(save_path)
+ plt.close()
+ print("generated successfully")
+
+
+def extract_metric_choice_1_2(bs_dirs, mark_name):
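+    # Each bs_<N> subdirectory holds the results for one batch size; read the metric from the latest summary log.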
+ bs = []
+ metric_value = []
+ bs_listdir = os.listdir(bs_dirs)
+ bs_listdir.sort(key=lambda x: (len(x), x))
+ for bs_dir in bs_listdir:
+ bs.append(int(bs_dir.split("_")[-1]))
+ res_path = os.path.join(bs_dirs, bs_dir)
+ # get the latest summary log file
+ res_listdir = os.listdir(res_path)
+ res_listdir = [res_i for res_i in res_listdir if "summary" in res_i]
+ res_listdir.sort(key=lambda x: (len(x), x))
+ log_file = os.path.join(res_path, res_listdir[-1])
+ with open(log_file) as f:
+ log_content = json.load(f)
+ metric_value.append(float(log_content[marks[mark_name]]))
+ return bs, metric_value
+
+
+def extract_metric_choice_3(iter_dirs):
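+    # Results are organized as iter_<i>/num_prompts_<n>/; collect next-token latency and output throughput from the latest summary log of each run.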
+ iters = []
+ num_prompts = []
+ latency_next_token = []
+ output_throughput = []
+ iter_listdir = os.listdir(iter_dirs)
+ iter_listdir.sort(key=lambda x: (len(x), x))
+ for iter_dir in iter_listdir:
+ prompt_dirs = os.path.join(iter_dirs, iter_dir)
+ iters.append(int(iter_dir.split("_")[-1]))
+ prompt_listdir = os.listdir(prompt_dirs)
+ prompt_listdir.sort(key=lambda x: (len(x), x))
+ for prompt_dir in prompt_listdir:
+ num_prompts.append(int(prompt_dir.split("_")[-1]))
+ res_path = os.path.join(prompt_dirs, prompt_dir)
+ # get the latest summary log file
+ res_listdir = os.listdir(res_path)
+ res_listdir = [res_i for res_i in res_listdir if "summary" in res_i]
+ res_listdir.sort(key=lambda x: (len(x), x))
+ log_file = os.path.join(res_path, res_listdir[-1])
+ with open(log_file) as f:
+ log_content = json.load(f)
+ latency_next_token.append(float(log_content[marks["latency_next_token_mark"]]))
+ output_throughput.append(float(log_content[marks["output_token_throughput_mark"]]))
+ return iters, num_prompts, latency_next_token, output_throughput
+
+
+def extract_metric_choice_4(iter_dirs):
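+    # Results are organized as iter_<i>/tokens_<input>_<output>/; collect first- and next-token latencies for each input token length.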
+ iters = []
+ input_tokens_length_li = []
+ latency_first_token = []
+ latency_next_token = []
+ iter_listdir = os.listdir(iter_dirs)
+ iter_listdir.sort(key=lambda x: (len(x), x))
+ for iter_dir in iter_listdir:
+ iters.append(int(iter_dir.split("_")[-1]))
+ token_dirs = os.path.join(iter_dirs, iter_dir)
+ for token_dir in os.listdir(token_dirs):
+ input_tokens_length_li.append(int(token_dir.split("_")[-2]))
+ res_path = os.path.join(token_dirs, token_dir)
+ # get the latest summary log file
+ res_listdir = os.listdir(res_path)
+ res_listdir = [res_i for res_i in res_listdir if "summary" in res_i]
+ res_listdir.sort(key=lambda x: (len(x), x))
+ log_file = os.path.join(res_path, res_listdir[-1])
+ with open(log_file) as f:
+ log_content = json.load(f)
+ latency_first_token.append(float(log_content[marks["latency_first_token_mark"]]))
+ latency_next_token.append(float(log_content[marks["latency_next_token_mark"]]))
+
+ return iters, input_tokens_length_li, latency_first_token, latency_next_token
+
+
+def main(args):
+ choice = args.choice
+ benchmark_dir = args.benchmark_dir
+ save_dir = args.save_dir
+ current_path = os.path.dirname(os.path.abspath(__file__))
+ if benchmark_dir is None:
+ if args.run_mode == "benchmark":
+ benchmark_dir = os.path.join(current_path, "results")
+ elif args.run_mode == "test":
+ benchmark_dir = os.path.join(current_path, "results_test")
+ else:
+ print(
+ f"Invalid run_mode, expected value 'test' or 'benchmark', but got {args.run_mode}."
+ )
+ exit(1)
+ if save_dir is None:
+ save_dir = os.path.join(current_path, "figures")
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+ if 1 in choice:
+ # get the peak output throughput of llm-on-ray with vllm
+ print("draw the output token throughput of llm-on-ray with vllm")
+ choice_dir = os.path.join(benchmark_dir, "choice_1")
+
+ mark_name = "output_token_throughput_mark"
+ bs, output_Token_Throughput = extract_metric_choice_1_2(choice_dir, mark_name)
+ print("bs: ", bs)
+ print("output_Token_throughput: ", output_Token_Throughput)
+
+ save_figure_path = os.path.join(save_dir, "choice1_vllm_peak_throughput.png")
+ plot_vllm_peak_throughput(bs, output_Token_Throughput, mark_name, save_figure_path)
+ if 2 in choice:
+        # compare output token throughput (and average latency per token) between llm-on-ray with vllm and llm-on-ray
+        print("draw vllm vs llm-on-ray (output token throughput / average latency per token)")
+ choice_dir = os.path.join(benchmark_dir, "choice_2")
+ vllm_dir = os.path.join(choice_dir, "vllm")
+ wo_vllm_dir = os.path.join(choice_dir, "wo_vllm")
+
+ mark_name = "output_token_throughput_mark"
+ bs, vllm_output_Token_Throughput = extract_metric_choice_1_2(vllm_dir, mark_name)
+ _, wo_vllm_output_Token_Throughput = extract_metric_choice_1_2(wo_vllm_dir, mark_name)
+ print("bs: ", bs)
+ print("vllm_output_Token_Throughput: ", vllm_output_Token_Throughput)
+ print("wo_vllm_output_Token_Throughput: ", wo_vllm_output_Token_Throughput)
+
+ save_figure_path = os.path.join(save_dir, "choice2_output_token_throughput_compare.png")
+ plot_compare_metric(
+ bs,
+ vllm_output_Token_Throughput,
+ wo_vllm_output_Token_Throughput,
+ mark_name,
+ save_figure_path,
+ )
+
+ mark_name = "latency_per_token_mark"
+ save_figure_path = os.path.join(save_dir, "choice2_average_latency_per_token_compare.png")
+ bs, vllm_average_latency_per_token = extract_metric_choice_1_2(vllm_dir, mark_name)
+ _, wo_vllm_average_latency_per_token = extract_metric_choice_1_2(wo_vllm_dir, mark_name)
+ print("vllm_average_latency_per_token: ", vllm_average_latency_per_token)
+ print("wo_vllm_average_latency_per_token: ", wo_vllm_average_latency_per_token)
+ plot_compare_metric(
+ bs,
+ vllm_average_latency_per_token,
+ wo_vllm_average_latency_per_token,
+ mark_name,
+ save_figure_path,
+ )
+ if 3 in choice:
+        # latency vs throughput tradeoff for various numbers of requests
+ choice_dir = os.path.join(benchmark_dir, "choice_3")
+ token_dirs = os.listdir(choice_dir)
+        print("draw latency vs throughput tradeoff for various numbers of requests")
+ for token_res in token_dirs:
+ iter_dirs = os.path.join(choice_dir, token_res)
+ save_figure_path = os.path.join(save_dir, "choice3_" + token_res)
+ iters, num_prompts, latency_next_token, output_throughput = extract_metric_choice_3(
+ iter_dirs
+ )
+
+ print("iter: ", iters)
+ print("num prompt: ", num_prompts)
+ print("latency_next_token: ", latency_next_token)
+ print("output_throughput: ", output_throughput)
+ num_iter = len(iters)
+ per_iter_len = int(len(num_prompts) / num_iter)
+ avg_latency_next_token = get_avg_metric(latency_next_token, num_iter, per_iter_len)
+ avg_output_throughput = get_avg_metric(output_throughput, num_iter, per_iter_len)
+ print("avg_latency_next_token: ", avg_latency_next_token)
+ print("avg_output_throughput: ", avg_output_throughput)
+ plot_latency_throughput(
+ num_prompts[:per_iter_len],
+ args.num_replica,
+ avg_latency_next_token,
+ avg_output_throughput,
+ save_figure_path,
+ )
+ if 4 in choice:
+        # get the latency of llm-on-ray with vllm
+ choice_dir = os.path.join(benchmark_dir, "choice_4")
+ print("get the latency of llm-on-ray with vllm")
+ (
+ iters,
+ input_tokens_length_li,
+ latency_first_token,
+ latency_next_token,
+ ) = extract_metric_choice_4(choice_dir)
+ print("iter: ", iters)
+ print("input_tokens_length: ", input_tokens_length_li)
+ print("latency_first_token: ", latency_first_token)
+ print("latency_next_token: ", latency_next_token)
+ num_iter = len(iters)
+ per_iter_len = int(len(input_tokens_length_li) / num_iter)
+ avg_latency_first_token = get_avg_metric(latency_first_token, num_iter, per_iter_len)
+ avg_latency_next_token = get_avg_metric(latency_next_token, num_iter, per_iter_len)
+ print("Results: ")
+ print(f"input_tokens_length: {input_tokens_length_li[:per_iter_len]}")
+ print(f"avg_latency_first_token: {avg_latency_first_token}")
+ print(f"avg_latency_next_token: {avg_latency_next_token}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Benchmark the online serving latency and throughput."
+ )
+ parser.add_argument(
+ "--choice",
+ nargs="*",
+ default=[1, 2, 3, 4],
+ type=int,
+        help="Which figures to draw. [1: the peak output token throughput of llm-on-ray with vllm, 2: llm-on-ray with vllm vs llm-on-ray, 3: latency vs throughput tradeoff, 4: the latency of llm-on-ray with vllm]",
+ )
+ parser.add_argument(
+ "--num-replica",
+ default=1,
+ type=int,
+ help="The number of replicas that respond to requests at the same time.",
+ )
+ parser.add_argument(
+ "--run-mode",
+ default="benchmark",
+ type=str,
+        help="Run mode used to generate the results: 'benchmark' or 'test'.",
+ )
+ parser.add_argument(
+ "--benchmark-dir",
+ default=None,
+ type=str,
+ help="The directory of benchmark results.",
+ )
+ parser.add_argument("--save-dir", default=None, type=str, help="The directory to save figures.")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh
new file mode 100644
index 000000000..188f5c415
--- /dev/null
+++ b/benchmarks/run_benchmark.sh
@@ -0,0 +1,232 @@
+#! /bin/bash
+set -eo pipefail
+
+CHOICE=${1}
+RUN_MODE=${2} # "test" or "benchmark", where "test" will only use a small part of the dataset
+if [ -z "$CHOICE" ]
+then
+    echo "Please pass in the value of parameter CHOICE, which can be any subset of 1,2,3,4."
+    exit 1
+fi
+if [ -z "$RUN_MODE" ]
+then
+    echo "Please pass in the value of parameter RUN_MODE, which can be 'test' or 'benchmark'."
+    exit 1
+fi
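+# A large value used where a concurrency limit should be effectively unlimited.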
+VALUE_INF=2000
+MODEL_ENDPOINT="http://localhost:8000/llama-2-7b-chat-hf"
+MODEL_NAME="llama-2-7b-chat-hf"
+SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
+BENCHMARK_SCRIPT=$SHELL_FOLDER"/benchmark_serving.py"
+WITH_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml"
+WO_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
+DATASET_PATH=$SHELL_FOLDER"/../dataset"
+DATASET_SHAREGPT_PATH=$SHELL_FOLDER"/../dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
+DATASET_IPEX_PATH=$SHELL_FOLDER"/../dataset/prompt.json"
+DATASET_BENCHMARK_NUM=1000
+DATASET_COMPARE_NUM=128
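+# Optional numactl prefixes for the server and client commands; the client is pinned to a NUMA node only in benchmark mode.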
+NUMA_SERVER_COMMAND=""
+NUM_REPLICA=4
+if [ ! -f $DATASET_SHAREGPT_PATH ]
+then
+    echo "Dataset $DATASET_SHAREGPT_PATH not found, downloading the ShareGPT dataset."
+ wget -q -P $DATASET_PATH https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+fi
+if [ ! -f $DATASET_IPEX_PATH ]
+then
+    echo "Dataset $DATASET_IPEX_PATH not found, downloading the IPEX prompt dataset."
+ wget -q -P $DATASET_PATH https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json
+fi
+if [ "$RUN_MODE" = "test" ]
+then
+ SAVE_DIR=$SHELL_FOLDER"/results_test"
+ NUMA_CLIENT_COMMAND=""
+elif [ "$RUN_MODE" = "benchmark" ]
+then
+ SAVE_DIR=$SHELL_FOLDER"/results"
+ NUMA_CLIENT_COMMAND="numactl -N 1 -m 1"
+else
+ echo "Invalid RUN_MODE, expected value 'test' or 'benchmark', but got '$RUN_MODE'."
+ exit 1
+
+fi
+
+get_peak_throughput(){
+ echo "get performance results of llm-on-ray with vllm based on different bs"
+ bs=${1}
+ echo "batch_size: $bs"
+ num_prompts=${2}
+ choice_dir=${3}
+ for vllm_bs in ${bs}
+ do
+ bs_dir_vllm=$choice_dir"/bs_"$vllm_bs
+ echo "RUN llm-on-ray with vllm"
+ echo "RUN bs ${vllm_bs}"
+ # server:
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+ # client:
+ $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
+ done
+ echo "CHOICE 1 generation completed"
+}
+
+metric_bs(){
+ echo "get performance results of llm-on-ray with vllm and llm-on-ray based on different bs"
+ bs=${1}
+ num_prompts=${2}
+ choice_dir_vllm=${3}
+ choice_dir_wo_vllm=${4}
+ for vllm_bs in ${bs}
+ do
+ bs_dir_vllm=$choice_dir_vllm"/bs_"$vllm_bs
+ echo "RUN llm-on-ray with vllm"
+ echo "RUN bs ${vllm_bs}"
+ # server:
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $vllm_bs
+ # client:
+ $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --vllm-engine --simple --results-dir $bs_dir_vllm
+ done
+ for wo_vllm_bs in ${bs}
+ do
+ echo "RUN llm-on-ray"
+ echo "RUN bs ${wo_vllm_bs}"
+ bs_dir_wo_vllm=$choice_dir_wo_vllm"/bs_"$wo_vllm_bs
+ # server:
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WO_VLLM_CONFIG_FILE --simple --max_concurrent_queries $wo_vllm_bs
+ # client:
+ $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_SHAREGPT_PATH --num-prompts $num_prompts --dataset-format ShareGPT --simple --results-dir $bs_dir_wo_vllm
+ done
+ echo "CHOICE 2 generation completed"
+}
+
+latency_throughput(){
+    echo "get performance results of llm-on-ray with vllm when responding to different numbers of requests"
+ num_iter=${1}
+ query_num=${2}
+ input_tokens_length=${3}
+ output_tokens_length=${4}
+ choice_dir=${5}
+ tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length
+
+ # server
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+
+ # client
+ for i in $(seq 1 $num_iter)
+ do
+ echo "Run iter $i"
+ iter_dir=$tokens_dir"/iter_"$i
+ for num_prompts in ${query_num}
+ do
+ results_dir=$iter_dir"/num_prompts_"$num_prompts
+ echo "Run num_prompts ${num_prompts}"
+ echo "results_dir: ${results_dir}"
+ $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_IPEX_PATH --num-prompts $num_prompts --dataset-format IPEX --input-tokens $input_tokens_length --max-new-tokens $output_tokens_length --track-token-latency --vllm-engine --simple --results-dir $results_dir
+ done
+ done
+ echo "CHOICE 3 generation completed"
+}
+
+get_best_latency(){
+ echo "get performance results of llm-on-ray with vllm when responding to input tokens of different lengths"
+ num_iter=${1}
+ input_tokens_length_li=${2}
+ output_tokens_length=${3}
+ choice_dir=${4}
+
+ # server
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF
+
+ # client
+ for i in $(seq 1 $num_iter)
+ do
+ echo "Run iter $i"
+ iter_dir=$choice_dir"/iter_"$i
+ for input_tokens_length in ${input_tokens_length_li}
+ do
+ echo "Run input_tokens_length ${input_tokens_length}"
+ token_dir=$iter_dir"/tokens_"$input_tokens_length"_"$output_tokens_length
+ $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_IPEX_PATH --num-prompts 1 --dataset-format IPEX --input-tokens $input_tokens_length --max-new-tokens $output_tokens_length --track-token-latency --vllm-engine --simple --results-dir $token_dir
+ done
+ done
+ echo "CHOICE 4 generation completed"
+}
+
+if [[ "$CHOICE" == *"1"* ]]
+then
+ benchmark_dir=$SAVE_DIR"/choice_1"
+ echo "results will be saved in $benchmark_dir"
+    # get the results of choice 1 (the peak output throughput of llm-on-ray with vllm)
+ if [ "$RUN_MODE" == "benchmark" ]
+ then
+ bs=(1 2 4 8 16 32 64 128 256 300 400 512)
+ prompt_num=$DATASET_BENCHMARK_NUM
+ elif [ "$RUN_MODE" == "test" ]
+ then
+ bs=(1 2 4)
+ prompt_num=8
+ fi
+    get_peak_throughput "${bs[*]}" $prompt_num $benchmark_dir
+fi
+if [[ "$CHOICE" == *"2"* ]]
+then
+ benchmark_dir=$SAVE_DIR"/choice_2"
+ echo "results will be saved in $benchmark_dir"
+ benchmark_dir_vllm=$benchmark_dir"/vllm"
+ benchmark_dir_wo_vllm=$benchmark_dir"/wo_vllm"
+    # get the results of choice 2 (compare output token throughput and average latency per token between llm-on-ray with vllm and llm-on-ray)
+ if [ "$RUN_MODE" == "benchmark" ]
+ then
+ bs=(1 2 4 8 16 32 64)
+ prompt_num=$DATASET_COMPARE_NUM
+ elif [ "$RUN_MODE" == "test" ]
+ then
+ bs=(1 2 4)
+ prompt_num=1
+ fi
+ metric_bs "${bs[*]}" $prompt_num $benchmark_dir_vllm $benchmark_dir_wo_vllm
+fi
+if [[ "$CHOICE" == *"3"* ]]
+then
+ benchmark_dir=$SAVE_DIR"/choice_3"
+ echo "results will be saved in $benchmark_dir"
+    # get the results of choice 3 (latency vs throughput tradeoff for various numbers of requests)
+ if [ "$RUN_MODE" == "benchmark" ]
+ then
+ iter=10
+ concurrent_query_num=(1 2 4 8 16 32 64)
+ for i in "${!concurrent_query_num[@]}"; do
+        concurrent_query_num[$i]=$(( concurrent_query_num[i] * NUM_REPLICA ))
+ done
+ # 32/64
+ input_tokens_length=32
+ output_tokens_length=64
+ latency_throughput $iter "${concurrent_query_num[*]}" $input_tokens_length $output_tokens_length $benchmark_dir
+ # 1024/128
+ input_tokens_length=1024
+ output_tokens_length=128
+ latency_throughput $iter "${concurrent_query_num[*]}" $input_tokens_length $output_tokens_length $benchmark_dir
+ elif [ "$RUN_MODE" == "test" ]
+ then
+ iter=2
+ concurrent_query_num=(1 2 4)
+ input_tokens_length=32
+ output_tokens_length=20
+ latency_throughput $iter "${concurrent_query_num[*]}" $input_tokens_length $output_tokens_length $benchmark_dir
+ fi
+fi
+if [[ "$CHOICE" == *"4"* ]]
+then
+ benchmark_dir=$SAVE_DIR"/choice_4"
+ echo "results will be saved in $benchmark_dir"
+    # get the results of choice 4 (the latency of llm-on-ray with vllm)
+ if [ "$RUN_MODE" == "benchmark" ]
+ then
+ iter=10
+ input_tokens_length=(32 128 1024 2016)
+ elif [ "$RUN_MODE" == "test" ]
+ then
+ iter=2
+ input_tokens_length=(32 128)
+ fi
+ output_tokens_length=32
+ get_best_latency $iter "${input_tokens_length[*]}" $output_tokens_length $benchmark_dir
+fi
diff --git a/docs/assets/choice1_vllm_peak_throughput.png b/docs/assets/choice1_vllm_peak_throughput.png
new file mode 100644
index 000000000..fc6ab4752
Binary files /dev/null and b/docs/assets/choice1_vllm_peak_throughput.png differ
diff --git a/docs/assets/choice2_output_token_throughput_compare.png b/docs/assets/choice2_output_token_throughput_compare.png
new file mode 100644
index 000000000..74c1a04bb
Binary files /dev/null and b/docs/assets/choice2_output_token_throughput_compare.png differ
diff --git a/docs/assets/choice3_tokens_32_64.png b/docs/assets/choice3_tokens_32_64.png
new file mode 100644
index 000000000..5e86a5c8f
Binary files /dev/null and b/docs/assets/choice3_tokens_32_64.png differ
diff --git a/docs/assets/solution_technical_overview.png b/docs/assets/solution_technical_overview.png
new file mode 100644
index 000000000..97f819b63
Binary files /dev/null and b/docs/assets/solution_technical_overview.png differ
diff --git a/docs/assets/webui_deployment.png b/docs/assets/webui_deployment.png
new file mode 100644
index 000000000..8090012ab
Binary files /dev/null and b/docs/assets/webui_deployment.png differ
diff --git a/docs/assets/webui_finetune.png b/docs/assets/webui_finetune.png
new file mode 100644
index 000000000..1055c946f
Binary files /dev/null and b/docs/assets/webui_finetune.png differ
diff --git a/docs/assets/webui_inference.png b/docs/assets/webui_inference.png
new file mode 100644
index 000000000..e1883e3f3
Binary files /dev/null and b/docs/assets/webui_inference.png differ
diff --git a/docs/benchmark_visualize.md b/docs/benchmark_visualize.md
new file mode 100644
index 000000000..4fca7ac94
--- /dev/null
+++ b/docs/benchmark_visualize.md
@@ -0,0 +1,49 @@
+The `benchmark_visualize.py` script visualizes benchmark results and generates figures. The performance data it consumes comes from the results produced by `run_benchmark.sh`, which supports two run modes: "test" and "benchmark". "test" uses only a small part of the dataset, allowing users to preview the generated figures quickly; choose "benchmark" mode to obtain the full results.
+### Command
+```bash
+# step 1: start ray cluster
+# start head node on a machine
+numactl -N 0 -m 0 ray start --head --include-dashboard False --num-cpus 0
+# start worker nodes on another machine (configure 27 cores * 4 replicas in the config file accordingly)
+numactl -N 0 -m 0 -C 0-55 ray start --address='$HEAD_NODE_IP:PORT' --num-cpus 56
+numactl -N 1 -m 1 -C 56-111 ray start --address='$HEAD_NODE_IP:PORT' --num-cpus 56
+# step 2: generate performance results
+bash benchmarks/run_benchmark.sh 1,2,3,4 "benchmark"
+# step 3: generate figure based on results
+python benchmarks/benchmark_visualize.py
+```
+The visualization script supports four scenarios, all of which are generated by default. You can set the `--choice` parameter to generate only specific figures.
+```bash
+choice=1
+# for multiple choices, pass a comma-separated list to run_benchmark.sh (e.g. 1,2,3) and space-separated values to benchmark_visualize.py (e.g. --choice 1 2 3)
+bash benchmarks/run_benchmark.sh $choice "benchmark"
+python benchmarks/benchmark_visualize.py --choice $choice
+```
+### Generated results of different Choices
+- Choice 1: the peak output token throughput of llm-on-ray with vllm
+
+  This choice measures the output token throughput at different batch sizes, so the peak throughput can be read clearly from the figure.
+
+
+
+- Choice 2: compare output token throughput (and average latency per token) between llm-on-ray with vllm and llm-on-ray
+
+  This choice generates two figures: one compares the output token throughput of llm-on-ray with vllm against llm-on-ray (sample figure shown), and the other compares the average latency per token. These figures show that vllm significantly improves inference performance as the batch size increases.
+
+
+
+- Choice 3: latency vs throughput tradeoff for various numbers of requests
+
+  The results of this choice help determine the maximum number of requests that can be served under given latency and throughput constraints. Results are generated for two parameter sets: input/output token lengths of 32/64 (sample figure shown) and 1024/128.
+
+
+
+- Choice 4: get the latency of llm-on-ray with vllm
+
+  This choice lists the latency (first-token latency and next-token latency) for different input token lengths, for example:
+ ```bash
+ Results:
+ input_tokens_length: [32, 128]
+ avg_latency_first_token: [0.897, 1.3145]
+ avg_latency_next_token: [0.383, 0.39]
+ ```
\ No newline at end of file
diff --git a/docs/web_ui.md b/docs/web_ui.md
index 5207c736f..0cc872341 100644
--- a/docs/web_ui.md
+++ b/docs/web_ui.md
@@ -21,19 +21,19 @@ You will get URL from the command line output (E.g. http://0.0.0.0:8080 for loca
## Finetune LLMs
On the `Finetune` tab, you can configure the base model, finetuning parameters, the dataset path and the new model name. Click `Start To Finetune` to start finetuning.
-![webui1](https://github.com/intel/llm-on-ray/assets/9278199/895be765-13d3-455e-a00d-c9ba67ac6781)
+![webui_finetune](./assets/webui_finetune.png)
## Deploy and Serve LLM
On the `Deployment` tab, you can choose a model to deploy, configure parameter `Model Replica Number`, `Cpus per Worker` and `Gpus per Worker`. Click `Deploy` and you will get a model endpoint.
-![webui2](https://github.com/intel/llm-on-ray/assets/9278199/2a1fb8f2-a2a8-4868-9d1c-418c5c2a6180)
+![webui_deployment](./assets/webui_deployment.png)
## Chatbot
On the `Inference` tab, you can now test the model by asking questions.
-![webui3](https://github.com/intel/llm-on-ray/assets/9278199/f7b9dc79-92fe-4e75-85fa-2cf7f36bb58d)
+![webui_inference](./assets/webui_inference.png)
diff --git a/pyproject.toml b/pyproject.toml
index dca007f93..6a3c44685 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,8 @@ dependencies = [
"py-cpuinfo",
"pydantic-yaml",
"async_timeout",
- "typer"
+ "typer",
+ "matplotlib"
]
[project.optional-dependencies]
diff --git a/tests/benchmarks/test_benchmark_visualize.py b/tests/benchmarks/test_benchmark_visualize.py
new file mode 100644
index 000000000..9718eb2c5
--- /dev/null
+++ b/tests/benchmarks/test_benchmark_visualize.py
@@ -0,0 +1,71 @@
+#
+# Copyright 2023 The LLM-on-Ray Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import pytest
+import subprocess
+import re
+
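+# Figures that choices 1-3 are expected to produce; choice 4 only prints latency numbers and creates no figure.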
+figures_name = [
+ "choice1_vllm_peak_throughput.png",
+ "choice2_average_latency_per_token_compare.png",
+ "choice3_tokens_32_20.png",
+]
+
+
+def script_with_args(choice):
+ global figures_name
+ current_path = os.path.dirname(os.path.abspath(__file__))
+ benchmark_script = os.path.join(current_path, "../../benchmarks/run_benchmark.sh")
+ visualize_script = os.path.join(current_path, "../../benchmarks/benchmark_visualize.py")
+ save_dir = os.path.join(current_path, "../../benchmarks/figures")
+
+ cmd_bench = ["bash", benchmark_script, str(choice), "test"]
+ cmd_visual = [
+ "python",
+ visualize_script,
+ "--choice",
+ str(choice),
+ "--run-mode",
+ "test",
+ ]
+ result_bench = subprocess.run(cmd_bench, capture_output=True, text=True)
+ assert "Error" not in result_bench.stderr
+ assert result_bench.returncode == 0
+ print("Output of result_bench:", result_bench)
+ result_visual = subprocess.run(cmd_visual, capture_output=True, text=True)
+ print(result_visual)
+ assert "Error" not in result_visual.stderr
+ assert result_visual.returncode == 0
+ print("Output of stderr:", result_visual.stderr)
+ bs = re.search(r"bs:\s+(\[.*?\])", result_visual.stdout)
+ if bs:
+ bs = bs.group(1)
+ iter = re.search(r"iter:\s+(\[.*?\])", result_visual.stdout)
+ if iter:
+ iter = iter.group(1)
+ assert (bs == "[1, 2, 4]") or (iter == "[1, 2]")
+ if choice in [1, 2, 3]:
+ file_path = os.path.join(save_dir, figures_name[choice - 1])
+ assert os.path.exists(file_path)
+
+
+@pytest.mark.parametrize(
+ "choice",
+ [(choice) for choice in [1, 2, 3, 4]],
+)
+def test_script(choice):
+ script_with_args(choice)
diff --git a/tests/run-tests-benchmark.sh b/tests/run-tests-benchmark.sh
new file mode 100755
index 000000000..54fb001fa
--- /dev/null
+++ b/tests/run-tests-benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -eo pipefail
+cd $(dirname $0)
+
+
+# Run pytest with the test file
+pytest -vv --capture=tee-sys --show-capture=all ./benchmarks
+
+echo "Pytest finished running tests."