From ed3545558cd4291f4313be89e98510ea210ddf67 Mon Sep 17 00:00:00 2001
From: songhappy
Date: Fri, 27 Oct 2023 00:58:23 -0700
Subject: [PATCH] update

---
 python/llm/example/CPU/applications/streaming-llm/README.md | 2 +-
 .../CPU/applications/streaming-llm/streaming_llm/utils.py   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/CPU/applications/streaming-llm/README.md b/python/llm/example/CPU/applications/streaming-llm/README.md
index a67c72e25b0..f4a79006db5 100644
--- a/python/llm/example/CPU/applications/streaming-llm/README.md
+++ b/python/llm/example/CPU/applications/streaming-llm/README.md
@@ -1,6 +1,6 @@
 # Low-Bit Streaming LLM using BigDL-LLM
 
-In this example, we apply [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including INT8/INT5/INT4) LLMs for infinite-length inputs.
+In this example, we apply [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit (including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
 Only one code change is needed to load the model using bigdl-llm as follows:
 ```python
 from bigdl.llm.transformers import AutoModelForCausalLM

diff --git a/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py
index dd15b25b66e..2aa1ded15d0 100644
--- a/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py
+++ b/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py
@@ -48,6 +48,7 @@
 import urllib.request
 import os
 import json
+# code change: import from the bigdl-llm API instead of the transformers API
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
@@ -61,6 +62,8 @@ def load(model_name_or_path):
         trust_remote_code=True,
     )
 
+    # set load_in_4bit=True to get a performance boost; set optimize_model=False for now
+    # TODO: align the logic of optimize_model and streaming
     model = AutoModelForCausalLM.from_pretrained(
         model_name_or_path,
         load_in_4bit=True,
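For reference, below is a minimal sketch of what the loading path in `utils.py` looks like after this patch. The checkpoint path is hypothetical, and `optimize_model=False` is inferred from the comment added in the diff; only `load_in_4bit=True` and the bigdl-llm import are confirmed by the patch itself:

```python
# Minimal sketch of loading a low-bit model with bigdl-llm after this patch.
# Assumptions: the model path is illustrative; optimize_model=False is taken
# from the patch comment, pending the TODO to align it with streaming.
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # hypothetical checkpoint

tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,     # low-bit (INT4) weights for the performance boost
    optimize_model=False,  # per the TODO: not yet aligned with streaming
    trust_remote_code=True,
)
model = model.eval()
```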