Merge pull request #150 from XueSongTap/dev
add Impl for llm edge benchmark suite
kubeedge-bot authored Oct 30, 2024
2 parents 432d321 + 5b1a925 commit 27519de
Showing 24 changed files with 696 additions and 1 deletion.
3 changes: 3 additions & 0 deletions core/testcasecontroller/algorithm/algorithm.py
@@ -77,6 +77,9 @@ def __init__(self, name, config):
        self.initial_model_url: str = ""
        self.modules: list = []
        self.modules_list = None
        self.mode: str = ""
        self.quantization_type: str = ""
        self.llama_quantize_path: str = ""
        self._parse_config(config)
        self._load_third_party_packages()

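The three new attributes are expected to be filled from the algorithm configuration by `_parse_config`. A hypothetical configuration excerpt is sketched below; the field placement and values are assumptions for illustration, not taken from this PR:

```yaml
# Hypothetical excerpt: keys mirror the new attributes, values are made-up examples.
mode: "with_compression"                            # switches the paradigm into the compression path
quantization_type: "q4_0"                           # a llama.cpp quantization preset (example value)
llama_quantize_path: "./llama.cpp/llama-quantize"   # path to the llama.cpp quantize binary (example)
```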
@@ -15,7 +15,7 @@
"""Single Task Learning Paradigm"""

import os

import subprocess
from core.common.constant import ParadigmType
from core.testcasecontroller.algorithm.paradigm.base import ParadigmBase

@@ -49,6 +49,11 @@ class SingleTaskLearning(ParadigmBase):
    def __init__(self, workspace, **kwargs):
        ParadigmBase.__init__(self, workspace, **kwargs)
        self.initial_model = kwargs.get("initial_model_url")
        self.mode = kwargs.get("mode")
        self.quantization_type = kwargs.get("quantization_type")
        self.llama_quantize_path = kwargs.get("llama_quantize_path")
        if kwargs.get("use_gpu", True):
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    def run(self):
        """
@@ -66,10 +71,43 @@ def run(self):

        trained_model = self._train(job, self.initial_model)

        if trained_model is None:
            trained_model = self.initial_model

        if self.mode == 'with_compression':
            trained_model = self._compress(trained_model)

        inference_result = self._inference(job, trained_model)

        return inference_result, self.system_metric_info

    def _compress(self, trained_model):
        if not os.path.exists(trained_model):
            return None

        if self.llama_quantize_path is None or not os.path.exists(self.llama_quantize_path):
            return None

        if self.quantization_type is None:
            return None

        compressed_model = trained_model.replace('.gguf', f'_{self.quantization_type}.gguf')

        command = [
            self.llama_quantize_path,
            trained_model,
            compressed_model,
            self.quantization_type
        ]

        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as _:
            return trained_model

        return compressed_model

    def _train(self, job, initial_model):
        train_output_dir = os.path.join(self.workspace, "output/train/")
        os.environ["BASE_MODEL_URL"] = initial_model
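The `_compress` step shells out to the llama.cpp quantize binary pointed to by `llama_quantize_path`. Conceptually, the subprocess call amounts to a command like the following (binary name, paths, and quantization type are illustrative only):

```shell
# Illustrative invocation; the argument order matches the command list built in _compress.
./llama-quantize models/qwen/qwen_1_5_0_5b.gguf models/qwen/qwen_1_5_0_5b_q4_0.gguf q4_0
```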
3 changes: 3 additions & 0 deletions core/testenvmanager/testenv/testenv.py
@@ -46,6 +46,7 @@ def __init__(self, config):
        self.round = 1
        self.client_number = 1
        self.dataset = None
        self.use_gpu = False  # default false
        self._parse_config(config)

    def _check_fields(self):
@@ -64,6 +65,8 @@ def _parse_config(self, config):
        for k, v in config_dict.items():
            if k == str.lower(Dataset.__name__):
                self.dataset = Dataset(v)
            elif k == 'use_gpu':
                self.use_gpu = bool(v)
            else:
                if k in self.__dict__:
                    self.__dict__[k] = v
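With this change, GPU use can presumably be switched from the test environment configuration. A minimal sketch of the relevant fragment, with the surrounding keys assumed rather than taken from this PR:

```yaml
# Hypothetical testenv.yaml fragment; only the use_gpu key is introduced by this PR.
testenv:
  use_gpu: false   # parsed by TestEnv._parse_config into self.use_gpu
```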
41 changes: 41 additions & 0 deletions examples/llm-edge-benchmark-suite/README.md
@@ -0,0 +1,41 @@
# Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs


## Dataset

### Prepare Data

The directory structure of the llm-edge-benchmark-suite example data is:

```
.
├── test_data
│ └── data.jsonl
└── train_data
└── data.jsonl
```

`train_data/data.jsonl` is empty, and `test_data/data.jsonl` contains entries such as:

```
{"question": "Which of the following numbers is the smallest prime number?\nA. 0\nB. 1\nC. 2\nD. 4", "answer": "C"}
```
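For reference, a minimal sketch of reading this JSONL test split in Python (the file path simply follows the layout above):

```python
import json

# Load the benchmark prompts from the JSONL test split.
with open("test_data/data.jsonl", "r", encoding="utf-8") as f:
    samples = [json.loads(line) for line in f if line.strip()]

for sample in samples:
    print(sample["question"], "->", sample["answer"])
```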
### Prepare Environment

```shell
python setup.py install
```
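The code added in this PR also imports `llama_cpp`, `psutil`, and `modelscope`; if they are not already available, they can be installed with pip (package names inferred from those imports, versions not pinned here):

```shell
pip install llama-cpp-python psutil modelscope
```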

### Run Ianvs

Run the baseline benchmark (no model compression):

```shell
ianvs -f examples/llm-edge-benchmark-suite/single_task_bench/benchmarkingjob.yaml
```

Run the benchmark with model compression enabled:

```shell
ianvs -f examples/llm-edge-benchmark-suite/single_task_bench_with_compression/benchmarkingjob.yaml
```

2 changes: 2 additions & 0 deletions examples/llm-edge-benchmark-suite/single_task_bench/README.md
@@ -0,0 +1,2 @@
Large Language Model Edge Benchmark Suite: Implementation on KubeEdge-Ianvs

@@ -0,0 +1,30 @@
benchmarkingjob:
  name: "benchmarkingjob"
  workspace: "./workspace"

  testenv: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testenv/testenv.yaml"

  test_object:
    type: "algorithms"
    algorithms:
      - name: "llama-cpp"
        url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/algorithm.yaml"

  rank:
    sort_by:
      - { "latency": "descend" }
      - { "throughput": "ascend" }
      - { "mem_usage": "ascend" }
      - { "prefill_latency": "ascend" }

    visualization:
      mode: "selected_only"
      method: "print_table"

    selected_dataitem:
      paradigms: [ "all" ]
      modules: [ "all" ]
      hyperparameters: [ "all" ]
      metrics: [ "latency", "throughput", "prefill_latency" ]

    save_mode: "selected_and_all"
@@ -0,0 +1,16 @@
algorithm:
  paradigm_type: "singletasklearningwithcompression"

  initial_model_url: "models/qwen/qwen_1_5_0_5b.gguf"

  modules:
    - type: "basemodel"
      name: "LlamaCppModel"
      url: "./examples/llm-edge-benchmark-suite/single_task_bench_with_compression/testalgorithms/basemodel.py"
      hyperparameters:
        - model_path:
            values:
              - "models/qwen/qwen_1_5_0_5b.gguf"
        - n_ctx:
            values:
              - 2048
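Each hyperparameter combination is handed to the registered base model as keyword arguments. A sketch of the effect of the configuration above (an illustration, not how Ianvs wires it internally):

```python
# Sketch only: the hyperparameter values above reach LlamaCppModel.__init__ as kwargs.
model = LlamaCppModel(
    model_path="models/qwen/qwen_1_5_0_5b.gguf",
    n_ctx=2048,
)
```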
@@ -0,0 +1,135 @@
from sedna.common.class_factory import ClassFactory, ClassType
from llama_cpp import Llama
from contextlib import redirect_stderr
import os
import psutil
import time
import io
import statistics
import logging

logging.getLogger().setLevel(logging.INFO)

@ClassFactory.register(ClassType.GENERAL, alias="LlamaCppModel")
class LlamaCppModel:
    def __init__(self, **kwargs):
        """
        init llama-cpp
        """
        model_path = kwargs.get("model_path")
        if not model_path:
            raise ValueError("Model path is required.")
        quantization_type = kwargs.get("quantization_type", None)
        if quantization_type:
            logging.info(f"Using quantization type: {quantization_type}")
        # Init LLM model
        self.model = Llama(
            model_path=model_path,
            n_ctx=kwargs.get("n_ctx", 512),
            n_gpu_layers=kwargs.get("n_gpu_layers", 0),
            seed=kwargs.get("seed", -1),
            f16_kv=kwargs.get("f16_kv", True),
            logits_all=kwargs.get("logits_all", False),
            vocab_only=kwargs.get("vocab_only", False),
            use_mlock=kwargs.get("use_mlock", False),
            embedding=kwargs.get("embedding", False),
        )

    def predict(self, data, input_shape=None, **kwargs):
        data = data[:10]
        process = psutil.Process(os.getpid())
        start_time = time.time()

        results = []
        total_times = []
        prefill_latencies = []
        mem_usages = []

        for prompt in data:
            prompt_start_time = time.time()

            f = io.StringIO()
            with redirect_stderr(f):
                output = self.model(
                    prompt=prompt,
                    max_tokens=kwargs.get("max_tokens", 32),
                    stop=kwargs.get("stop", ["Q:", "\n"]),
                    echo=kwargs.get("echo", True),
                    temperature=kwargs.get("temperature", 0.8),
                    top_p=kwargs.get("top_p", 0.95),
                    top_k=kwargs.get("top_k", 40),
                    repeat_penalty=kwargs.get("repeat_penalty", 1.1),
                )
            stdout_output = f.getvalue()

            # parse timing info
            timings = self._parse_timings(stdout_output)
            prefill_latency = timings.get('prompt_eval_time', 0.0)  # ms
            generated_text = output['choices'][0]['text']

            prompt_end_time = time.time()
            prompt_total_time = (prompt_end_time - prompt_start_time) * 1000  # convert to ms

            result_with_time = {
                "generated_text": generated_text,
                "total_time": prompt_total_time,
                "prefill_latency": prefill_latency,
                "mem_usage": process.memory_info().rss,
            }

            results.append(result_with_time)

        predict_dict = {
            "results": results,
        }

        return predict_dict

    def _parse_timings(self, stdout_output):
        import re
        timings = {}
        for line in stdout_output.split('\n'):
            match = re.match(r'llama_print_timings:\s*(.+?)\s*=\s*([0-9\.]+)\s*ms', line)
            if match:
                key = match.group(1).strip()
                value = float(match.group(2))

                key = key.lower().replace(' ', '_')
                timings[key] = value

        return timings
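    # Illustrative (assumed) stderr lines from llama.cpp that _parse_timings consumes:
    #   llama_print_timings: prompt eval time =      45.23 ms
    #   llama_print_timings:        eval time =     310.77 ms
    # These would parse into {'prompt_eval_time': 45.23, 'eval_time': 310.77}.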

    def evaluate(self, data, model_path=None, **kwargs):
        """
        evaluate model
        """
        if data is None or data.x is None:
            raise ValueError("Evaluation data is None.")

        if model_path:
            self.load(model_path)

        # do predict
        predict_dict = self.predict(data.x, **kwargs)

        # compute metrics
        metric = kwargs.get("metric")
        if metric is None:
            raise ValueError("No metric provided in kwargs.")

        metric_name, metric_func = metric

        if callable(metric_func):
            metric_value = metric_func(None, predict_dict["results"])
            return {metric_name: metric_value}
        else:
            raise ValueError(f"Metric function {metric_name} is not callable or not provided.")

    def save(self, model_path):
        pass

    def load(self, model_url):
        pass

    def train(self, train_data, valid_data=None, **kwargs):
        return
@@ -0,0 +1,25 @@
import os
import argparse
import logging
from modelscope import snapshot_download

logging.getLogger().setLevel(logging.INFO)

def download_model(model_id, revision, local_dir):
    try:
        model_dir = snapshot_download(model_id, revision=revision, cache_dir=local_dir)
        logging.info(f"Model successfully downloaded to: {model_dir}")
        return model_dir
    except Exception as e:
        logging.info(f"Error downloading model: {str(e)}")
        return None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download a model from ModelScope")
    parser.add_argument("--model_id", type=str, required=True, help="ModelScope model ID")
    parser.add_argument("--revision", type=str, default="master", help="Model revision")
    parser.add_argument("--local_dir", type=str, required=True, help="Local directory to save the model")

    args = parser.parse_args()

    download_model(args.model_id, args.revision, args.local_dir)
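A possible invocation of this helper, assuming it is saved as `download_model.py` (the script name and the ModelScope model ID below are assumptions, not part of this PR):

```shell
python download_model.py --model_id qwen/Qwen1.5-0.5B --local_dir ./models
```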
@@ -0,0 +1,29 @@
# Copyright 2023 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sedna.common.class_factory import ClassType, ClassFactory
import statistics

__all__ = ["latency"]


@ClassFactory.register(ClassType.GENERAL, alias="latency")
def latency(y_true, y_pred):
    results_list = y_pred.get('results', [])
    num_requests = len(results_list)
    total_latency = 0.0
    for result in results_list:
        total_latency += result['total_time']
    average_latency = total_latency / num_requests
    return average_latency
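A quick illustration with made-up numbers: the metric averages the per-request `total_time` values (milliseconds) carried in the prediction results.

```python
# Two fabricated results with total_time 120.0 ms and 80.0 ms average to 100.0 ms.
example_pred = {"results": [{"total_time": 120.0}, {"total_time": 80.0}]}
assert latency(None, example_pred) == 100.0
```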
@@ -0,0 +1,13 @@
from sedna.common.class_factory import ClassType, ClassFactory

__all__ = ["mem_usage"]

@ClassFactory.register(ClassType.GENERAL, alias="mem_usage")
def mem_usage(y_true, y_pred):
    results_list = y_pred.get('results', [])
    total_mem_usage = 0.0
    num_requests = len(results_list)
    for result in results_list:
        total_mem_usage += result['mem_usage']
    average_mem_usage = total_mem_usage / num_requests
    return average_mem_usage