From ed3545558cd4291f4313be89e98510ea210ddf67 Mon Sep 17 00:00:00 2001
From: songhappy
Date: Fri, 27 Oct 2023 00:58:23 -0700
Subject: [PATCH] update

---
 python/llm/example/CPU/applications/streaming-llm/README.md | 2 +-
 .../CPU/applications/streaming-llm/streaming_llm/utils.py   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/CPU/applications/streaming-llm/README.md b/python/llm/example/CPU/applications/streaming-llm/README.md
index a67c72e25b0..f4a79006db5 100644
--- a/python/llm/example/CPU/applications/streaming-llm/README.md
+++ b/python/llm/example/CPU/applications/streaming-llm/README.md
@@ -1,6 +1,6 @@
 # Low-Bit Streaming LLM using BigDL-LLM
 
-In this example, we apply [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit(including INT8/INT5/INT4) LLMs for infinite-length inputs.
+In this example, we apply [Streaming-LLM](https://github.com/mit-han-lab/streaming-llm/tree/main#efficient-streaming-language-models-with-attention-sinks) using BigDL-LLM, which can deploy low-bit (including FP4/INT4/FP8/INT8) LLMs for infinite-length inputs.
 Only one code change is needed to load the model using bigdl-llm as follows:
 ```python
 from bigdl.llm.transformers import AutoModelForCausalLM

diff --git a/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py
index dd15b25b66e..2aa1ded15d0 100644
--- a/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py
+++ b/python/llm/example/CPU/applications/streaming-llm/streaming_llm/utils.py
@@ -48,6 +48,7 @@
 import urllib.request
 import os
 import json
+# code change: import from the bigdl-llm API instead of the transformers API
 from bigdl.llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
@@ -61,6 +62,8 @@ def load(model_name_or_path):
         trust_remote_code=True,
     )
 
+    # set load_in_4bit=True to get a performance boost; set optimize_model=False for now
+    # TODO: align the logic of optimize_model and streaming
     model = AutoModelForCausalLM.from_pretrained(
         model_name_or_path,
         load_in_4bit=True,
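For reference, below is a minimal sketch of what the loading path in `utils.py` looks like after this patch. The checkpoint path is hypothetical, and `optimize_model=False` is inferred from the comment added in the diff; only `load_in_4bit=True` and the bigdl-llm import are confirmed by the patch itself:

```python
# Minimal sketch of loading a low-bit model with bigdl-llm after this patch.
# Assumptions: the model path is illustrative; optimize_model=False is taken
# from the patch comment, pending the TODO to align it with streaming.
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # hypothetical checkpoint

tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_4bit=True,     # low-bit (INT4) weights for the performance boost
    optimize_model=False,  # per the TODO: not yet aligned with streaming
    trust_remote_code=True,
)
model = model.eval()
```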