From 82a1e21f5c26231c82c86d3885b7766ad5e12f6c Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 6 Sep 2023 08:19:35 +0800
Subject: [PATCH 01/15] =?UTF-8?q?Create=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...00\345\217\221\346\214\207\345\215\227.md" | 294 ++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 "\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
new file mode 100644
index 0000000..499b122
--- /dev/null
+++ "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -0,0 +1,294 @@
+# A Guide to Developing Large Language Model Applications on 13th Gen Intel CPUs
+
+This document describes how to build a large language model (LLM) application UI on top of the open-source Intel bigdl-llm library and Gradio. The UI runs on a Windows 11 x86 CPU and serves optimized Native INT4 LLMs on a PC with 16 GB of RAM. Three models serve as examples: ChatGLM2 (6B, Chinese/English), LLaMA2 (13B, English), and StarCoder (15.5B, Chinese/English).
+## 1 Environment setup
+(1) On Windows 11, install Miniconda3-py39_23.5.2-0-Windows-x86_64.exe. Download link:
+https://docs.conda.io/en/latest/miniconda.html#windows-installers
+
+(2) Open an Anaconda Powershell Prompt window:
+```
+ conda create -n llm python=3.9
+ conda activate llm
+ pip install --pre --upgrade bigdl-llm[all]
+ pip install gradio mdtex2html
+```
+Alternatively, install a pinned version:
+```
+ pip install --pre bigdl-llm[all]==2.4.0b20230820 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+## 2 LLM model conversion
+Using ChatGLM2, Llama2, and StarCoder as examples, download the Hugging Face FP16 models:
+
+· ChatGLM2-6B: https://huggingface.co/THUDM/chatglm2-6b/tree/main
+
+· Llama2-13B: https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/tree/main
+
+· StarCoder: https://huggingface.co/bigcode/starcoder/tree/main
+
+### 2.1 Converting FP16 to Native INT4 and calling it from Python (recommended on CPU)
+Convert ChatGLM2, Llama2, and StarCoder to Native INT4.
+
+Open Anaconda PowerShell, adjust the model paths and the output folder name, then run:
+```
+ conda activate llm
+ llm-convert "C:/llm-models/chatglm2-6b/" --model-format pth --model-family "chatglm" --outfile "checkpoint/"
+ llm-convert "C:/llm-models/llama-2-13b-chat-hf/" --model-format pth --model-family "llama" --outfile "checkpoint/"
+ llm-convert "C:/llm-models/starcoder/" --model-format pth --model-family "starcoder" --outfile "checkpoint/"
+```
+Note: StarCoder cannot be converted to Native INT4 on a 16 GB machine because it runs out of memory; convert StarCoder to Native INT4 on a machine with more RAM.
+
+#### Loading the Native INT4 model from Python
+Parameter notes:
+
+(1) n_threads = number of P-cores × 2 + number of E-cores, or
+
+n_threads = number of P-cores × 2 + number of E-cores − 1, or
+
+n_threads = number of P-cores × 2 + number of E-cores − 2
+
+Try these three values on each device and keep the one that performs best.
+
+On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket
+```
+export OMP_NUM_THREADS=48
+numactl -C 0-47 -m 0 jupyter notebook
+```
+(2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+from bigdl.llm.transformers import BigdlNativeForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+
+elif model_name == "llama2-13b":
+    model = BigdlNativeForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_all_local_path + "\\bigdl_llm_llama2_13b_q4_0.bin",
+        model_family='llama',n_threads=20,n_ctx=4096)
+elif model_name == "StarCoder":
+    model = BigdlNativeForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_all_local_path + "\\bigdl_llm_starcoder_q4_0.bin",
+        model_family='starcoder',n_threads=20,n_ctx=4096)
+```
+### 2.2 Converting FP16 to transformer INT4 and calling it from Python
+Transformer INT4 runs somewhat slower than Native INT4 on CPU.
+
+Convert the model to transformer INT4 with a Python script
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+from bigdl.llm.transformers import AutoModelForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+
+if model_name == "chatglm2-6b":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\chatglm2-6b-int4\\")
+    tokenizer.save_pretrained("D:\\llm-models\\chatglm2-6b-int4\\")
+
+elif model_name == "llama2-13b" or model_name == "StarCoder":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\"+model_name)
+    tokenizer.save_pretrained("D:\\llm-models\\"+model_name)
+
+Load the transformer INT4 model from Python
+if model_name == "chatglm2-6b":
+    model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+elif model_name == "llama2-13b" or model_name == "StarCoder":
+    model = AutoModelForCausalLM.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+```
+## 3 Benchmarking the LLM on CPU
+Benchmarking the Native INT4 models on CPU uses all cores, which makes the numbers easy to compare against the application UI's performance metrics.
+
+Open an Anaconda PowerShell Prompt
+```
+ conda activate llm
+ChatGLM2:
+ llm-cli -t 20 -x chatglm -m "ggml-chatglm2-6b-q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -v -n 32
+Llama2:
+ llm-cli -t 20 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+Starcoder:
+ llm-cli -t 20 -x starcoder -m "bigdl_llm_starcoder_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+```
+Parameter note: -n 32 caps the output at 32 tokens.
+
+Extract the performance figures from the command-line output, as shown in the figure:
+
+Input token: 32 tokens
+
+Output token: 32 tokens (31 runs = 32 tokens – 1st token)
+
+1st token avg latency (ms) = 1541.56 ms
+
+2nd+ token avg latency (ms/token) = 125.62 ms per token
+
+Figure 1: llm-cli output
+
+## 4 Streaming the output token by token
+### 4.1 (recommended on CPU): Native INT4 for chatglm2, llama2, and starcoder
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+prompt = "What is AI?"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+    response = ""
+    for chunk in model(prompt, temperature=0.95,top_p=0.8,stream=True,max_tokens=512):
+        response += chunk['choices'][0]['text']
+```
+llama2 and starcoder stream the same way, also with a for loop.
+
+Parameter descriptions:
+
+· Temperature (higher values make the output more random), adjustable from 0 to 1
+
+· Top P (higher values make word choice more diverse), adjustable from 0 to 1
+
+· Max Length (maximum number of output tokens), adjustable from 0 to 2048; the ceiling is model-dependent. These three models support n_ctx up to 8k, so input plus output tokens should stay below 8k.
+
+### 4.2 Transformer INT4 stream_chat, chatglm2 only
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+import torch
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+prompt = "What is AI?"
+history = []  # list of (query, response) pairs from earlier turns
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+with torch.inference_mode():
+    for response, history in model.stream_chat(tokenizer, prompt, history, max_length=512, top_p=0.9,temperature=0.9):
+        print(response)
+```
+### 4.3 Transformer INT4 TextIteratorStreamer for chatglm2, llama2, and starcoder
+```
+import time
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer,TextIteratorStreamer
+import torch
+from benchmark_util import BenchmarkWrapper
+
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+prompt = "What is AI?"
+with torch.inference_mode():
+    model=BenchmarkWrapper(model)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    response = ""
+    timeStart = time.time()
+    # out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=512)
+    generate_kwargs = dict(**inputs,streamer=streamer,temperature=0.9, top_p=0.9, max_new_tokens=512)
+    from threading import Thread
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    for new_text in streamer:
+        response += new_text
+    timeCost = time.time() - timeStart
+    token_count_input = len(tokenizer.tokenize(prompt))
+
+    token_count_output = int(out[0,2])+1
+    ms_first_token = float(out[0,0])
+    ms_after_token = float(out[0,1]) ## the Tensor out is defined in benchmark_util.py L2476
+```
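+
+One step is worth making explicit here (an illustrative addition, not from the original guide): join the generation thread before reading the timings, and recover a rough output-token count from the accumulated text:
+```
+# Hypothetical wrap-up for the streaming loop above.
+thread.join()  # make sure generation has fully finished
+token_count_output = len(tokenizer.tokenize(response))  # rough count
+print(f"{token_count_output / timeCost:.2f} tokens/s")
+```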
print("******************* history_round ", history_round) + + timeStart = time.time() + for chunk in model(prompt, temperature=0.9,top_p=0.9, stream=True,max_tokens=512): + response += chunk['choices'][0]['text'] + history = prompt + response + print("******** max_length history",len(model.tokenize(history))) +``` +### 5.3 对于transformer INT4 TextIteratorStreamer同5.2 +## 6 用gradio写Web UI +gradio模板风格库:https://huggingface.co/spaces/gradio/theme-gallery + +下载代码:https://github.com/KiwiHana/LLM_UI_Windows_CPU + +为了使用全部核,用管理员打开Anaconda Powershell Prompt窗口,运行LLM_demo_v1.0.py 或 LLM_demo_v2.0.py。 +``` +git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git +cd LLM_UI_Windows_CPU +conda activate llm +python LLM_demo_v1.0.py +``` +Note: 修改LLM_demo_v1.0.py脚本第285行 main函数里的模型存放路径, + +例如 model_all_local_path = "C:/Users/username/checkpoint/" + +· 大语言模型应用UIv1.0文件夹应包含: + +LLM_demo_v1.0.py + +theme3.json + +checkpoint + +-- bigdl_llm_llama2_13b_q4_0.bin + +-- bigdl_llm_starcoder_q4_0.bin + +-- ggml-chatglm2-6b-q4_0.bin + +修改Run_Intel_LLM_Demo.bat里环境名称如llm,代码路径。为了使用全部核,用管理员权限打开Run_Intel_LLM_Demo.bat +``` +D: +cd D:\PC_LLM_UI\ +call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3 +call conda activate llm +start python LLM_demo_v1.0.py +``` +参考链接: + +https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup + +https://github.com/intel-analytics/BigDL From 624b85537e24cd5ec11eb16952f797cf37baabfd Mon Sep 17 00:00:00 2001 From: KiwiHana <102839943+KiwiHana@users.noreply.github.com> Date: Wed, 6 Sep 2023 08:28:03 +0800 Subject: [PATCH 02/15] =?UTF-8?q?Update=20=E5=A4=A7=E8=AF=AD=E8=A8=80?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?= =?UTF-8?q?=E5=8D=97.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...74\200\345\217\221\346\214\207\345\215\227.md" | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" index 499b122..db002af 100644 --- "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -250,10 +250,12 @@ def predict(input): ``` ### 5.3 对于transformer INT4 TextIteratorStreamer同5.2 ## 6 用gradio写Web UI -gradio模板风格库:https://huggingface.co/spaces/gradio/theme-gallery - 下载代码:https://github.com/KiwiHana/LLM_UI_Windows_CPU +![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5a399c7e-31b4-4337-a6a4-bc6f8bccb93c) +图2:LLM_UI_Windows_CPU界面 + + 为了使用全部核,用管理员打开Anaconda Powershell Prompt窗口,运行LLM_demo_v1.0.py 或 LLM_demo_v2.0.py。 ``` git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git @@ -279,14 +281,7 @@ checkpoint -- ggml-chatglm2-6b-q4_0.bin -修改Run_Intel_LLM_Demo.bat里环境名称如llm,代码路径。为了使用全部核,用管理员权限打开Run_Intel_LLM_Demo.bat -``` -D: -cd D:\PC_LLM_UI\ -call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3 -call conda activate llm -start python LLM_demo_v1.0.py -``` + 参考链接: https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup From d3817124a90be845d685938c6081d5e0bd055e2e Mon Sep 17 
+### 5.3 For transformer INT4 TextIteratorStreamer, same as 5.2
+## 6 Building the Web UI with Gradio
+Gradio theme gallery: https://huggingface.co/spaces/gradio/theme-gallery
+
+Download the code: https://github.com/KiwiHana/LLM_UI_Windows_CPU
+
+To use all cores, open an Anaconda Powershell Prompt window as administrator and run LLM_demo_v1.0.py or LLM_demo_v2.0.py.
+```
+git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git
+cd LLM_UI_Windows_CPU
+conda activate llm
+python LLM_demo_v1.0.py
+```
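+
+The heart of such a UI fits in a few lines. A minimal illustration (an assumption-laden sketch, not the contents of LLM_demo_v1.0.py; it assumes the gradio 3.4x ChatInterface API and the Native INT4 `model` object from section 2.1):
+```
+import gradio as gr
+
+def chat_fn(message, history):
+    # Stream partial responses back to the UI as the model emits chunks.
+    response = ""
+    for chunk in model(message, stream=True, max_tokens=512):
+        response += chunk['choices'][0]['text']
+        yield response
+
+gr.ChatInterface(chat_fn).launch()
+```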
+Note: change the model storage path in the main function at line 285 of the LLM_demo_v1.0.py script,
+
+e.g. model_all_local_path = "C:/Users/username/checkpoint/"
+
+· The LLM application UI v1.0 folder should contain:
+
+LLM_demo_v1.0.py
+
+theme3.json
+
+checkpoint
+
+-- bigdl_llm_llama2_13b_q4_0.bin
+
+-- bigdl_llm_starcoder_q4_0.bin
+
+-- ggml-chatglm2-6b-q4_0.bin
+
+Modify the environment name (e.g. llm) and the code path in Run_Intel_LLM_Demo.bat. To use all cores, open Run_Intel_LLM_Demo.bat with administrator privileges
+```
+D:
+cd D:\PC_LLM_UI\
+call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3
+call conda activate llm
+start python LLM_demo_v1.0.py
+```
+References:
+
+https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup
+
+https://github.com/intel-analytics/BigDL

From 624b85537e24cd5ec11eb16952f797cf37baabfd Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 6 Sep 2023 08:28:03 +0800
Subject: [PATCH 02/15] =?UTF-8?q?Update=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...74\200\345\217\221\346\214\207\345\215\227.md" | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 499b122..db002af 100644
--- "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -250,10 +250,12 @@
 ```
 ### 5.3 For transformer INT4 TextIteratorStreamer, same as 5.2
 ## 6 Building the Web UI with Gradio
-Gradio theme gallery: https://huggingface.co/spaces/gradio/theme-gallery
-
 Download the code: https://github.com/KiwiHana/LLM_UI_Windows_CPU
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5a399c7e-31b4-4337-a6a4-bc6f8bccb93c)
+Figure 2: the LLM_UI_Windows_CPU interface
+
+
 To use all cores, open an Anaconda Powershell Prompt window as administrator and run LLM_demo_v1.0.py or LLM_demo_v2.0.py.
 ```
 git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git
 cd LLM_UI_Windows_CPU
 conda activate llm
 python LLM_demo_v1.0.py
@@ -279,14 +281,7 @@ checkpoint
 
 -- bigdl_llm_llama2_13b_q4_0.bin
 
 -- bigdl_llm_starcoder_q4_0.bin
 
 -- ggml-chatglm2-6b-q4_0.bin
 
-Modify the environment name (e.g. llm) and the code path in Run_Intel_LLM_Demo.bat. To use all cores, open Run_Intel_LLM_Demo.bat with administrator privileges
-```
-D:
-cd D:\PC_LLM_UI\
-call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3
-call conda activate llm
-start python LLM_demo_v1.0.py
-```
+
 References:
 
 https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup

From d3817124a90be845d685938c6081d5e0bd055e2e Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 6 Sep 2023 08:31:28 +0800
Subject: [PATCH 03/15] =?UTF-8?q?Update=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From 89e62e7e59341aa5b917d5065a9f055ddfa872d0 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:34:56 +0800
Subject: [PATCH 04/15] =?UTF-8?q?Rename=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md=20to=20Chinese=5FVersion=20/ch=5F8=5FApplications?=
 =?UTF-8?q?=E5=A4=A7Chinese=5FVersion=20/ch=5F8=5FApplications=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91?=
 =?UTF-8?q?=E6=8C=87=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (100%)

diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 100%
rename from "\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

From b636b7d85986aaa0b372ba84791c36b928d783b9 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:35:44 +0800
Subject: [PATCH 05/15] =?UTF-8?q?Rename=20ch=5F8=5FApplications=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91?=
 =?UTF-8?q?=E6=8C=87=E5=8D=97.md=20to=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (100%)

diff --git "a/Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 100%
rename from "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

From 48b85bab50f8b2d7f872d3e3cfbb6e96e32040d2 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:36:30 +0800
Subject: [PATCH 06/15] =?UTF-8?q?Rename=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md=20to=20=E5=9F=BA=E4=BA=8EIntel=2013=E4=BB=A3CPU?=
 =?UTF-8?q?=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94?=
 =?UTF-8?q?=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (100%)

diff --git "a/Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 100%
rename from "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

From 7ffb334b9896484c0552a3eaae026b2b7a94b2fb Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:38:47 +0800
Subject: [PATCH 07/15] =?UTF-8?q?Update=20and=20rename=20=E5=9F=BA?=
 =?UTF-8?q?=E4=BA=8EIntel=2013=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91?=
 =?UTF-8?q?=E6=8C=87=E5=8D=97.md=20to=20=E5=9F=BA=E4=BA=8EIntel13=E4=BB=A3?=
 =?UTF-8?q?CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 2 --
 1 file changed, 2 deletions(-)
 rename "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (99%)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 99%
rename from "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index db002af..cbb3c5b 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -285,5 +285,3 @@ checkpoint
 References:
 
 https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup
-
-https://github.com/intel-analytics/BigDL

From 15be3e6ad4bb7b0414906e0b8f4e899277674289 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 13 Oct 2023 09:40:39 +0800
Subject: [PATCH 08/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index cbb3c5b..6eabe7d 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -10,7 +10,7 @@ https://docs.conda.io/en/latest/miniconda.html#windows-installers
  conda create -n llm python=3.9
  conda activate llm
  pip install --pre --upgrade bigdl-llm[all]
- pip install gradio mdtex2html
+ pip install gradio==3.41.1 mdtex2html
 ```
 Alternatively, install a pinned version:
 ```

From 7f35187aebfb790be86097b6452c52eb557556c6 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 25 Oct 2023 00:15:26 +0800
Subject: [PATCH 09/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...00\345\217\221\346\214\207\345\215\227.md" | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 6eabe7d..fe93e4e 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -48,11 +48,34 @@ n_threads = number of P-cores × 2 + number of E-cores − 2
 
 Try these three values on each device and keep the one that performs best.
 
-On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket
+On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket.
+On a two-socket SPR system, inference must be pinned to all physical cores of the first socket.
+
+On Ubuntu, assuming the first SPR socket has 48 physical cores: numactl -C 0-47 -m 0 $command
 ```
+sudo apt install numactl
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+source bigdl-nano-init -c
 export OMP_NUM_THREADS=48
-numactl -C 0-47 -m 0 jupyter notebook
+$ numactl -C 0-47 -m 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
+```
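+
+A quick way to compute the per-socket physical core count (an illustrative sketch, not part of the original patch; it assumes the psutil package is installed and two identical sockets):
+```
+# Hypothetical helper: physical cores per socket on a dual-socket machine.
+import psutil
+total_physical = psutil.cpu_count(logical=False)  # both sockets combined
+per_socket = total_physical // 2                  # assumes 2 identical sockets
+print(per_socket)  # candidate value for OMP_NUM_THREADS and llm-cli -t
+```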
+
+On Windows, assuming the first SPR socket has 48 physical cores: start /node 0 $command
+```
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+> start /node 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
 ```
+
 (2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
 ```
 from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
@@ -94,8 +117,9 @@ elif model_name == "llama2-13b" or model_name == "StarCoder":
     model = AutoModelForCausalLM.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
     model.save_low_bit("D:\\llm-models\\"+model_name)
     tokenizer.save_pretrained("D:\\llm-models\\"+model_name)
-
+```
 Load the transformer INT4 model from Python
+```
 if model_name == "chatglm2-6b":
     model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
     model = model.eval()

From e5e3ab1ec0ab811867373d0144fe01735b653dd5 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 25 Oct 2023 00:16:35 +0800
Subject: [PATCH 10/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...5\274\200\345\217\221\346\214\207\345\215\227.md" | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index fe93e4e..9f70a1e 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -135,12 +135,12 @@
 Open an Anaconda PowerShell Prompt
 ```
 conda activate llm
-ChatGLM2:
- llm-cli -t 20 -x chatglm -m "ggml-chatglm2-6b-q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -v -n 32
-Llama2:
- llm-cli -t 20 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
-Starcoder:
- llm-cli -t 20 -x starcoder -m "bigdl_llm_starcoder_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+#ChatGLM2:
+$ llm-cli -t 20 -x chatglm -m "ggml-chatglm2-6b-q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -v -n 32
+#Llama2:
+$ llm-cli -t 20 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+#Starcoder:
+$ llm-cli -t 20 -x starcoder -m "bigdl_llm_starcoder_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
 ```
 Parameter note: -n 32 caps the output at 32 tokens.

From 6b28198f6eb1c109ff9f52951fe7bf9cbc4a0fbe Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 25 Oct 2023 00:20:19 +0800
Subject: [PATCH 11/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 3 ---
 1 file changed, 3 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 9f70a1e..5809166 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -226,9 +226,6 @@ with torch.inference_mode():
     timeCost = time.time() - timeStart
     token_count_input = len(tokenizer.tokenize(prompt))
 
-    token_count_output = int(out[0,2])+1
-    ms_first_token = float(out[0,0])
-    ms_after_token = float(out[0,1]) ## the Tensor out is defined in benchmark_util.py L2476
 ```
 
 ## 5 Adding multi-turn chat history

From a824a036889ccbdc8eff6194c2f4603a3d84400d Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:43:40 +0800
Subject: [PATCH 12/15] =?UTF-8?q?Create=20=E5=9F=BA=E4=BA=8EXeon=E6=88=96S?=
 =?UTF-8?q?PR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...24\347\224\250\345\274\200\345\217\221.md" | 309 ++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100644 "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
new file mode 100644
index 0000000..1737f1b
--- /dev/null
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
@@ -0,0 +1,309 @@
+# A Guide to Developing Large Language Model Applications on Intel Xeon and SPR
+
+This document describes how to build a large language model (LLM) application UI on top of the open-source Intel bigdl-llm library and Gradio.
+The UI runs on a Windows 11 x86 CPU or on Ubuntu, and serves optimized Native INT4 LLMs on systems with six or more 16 GB DIMMs.
+Two models serve as examples: ChatGLM2 (6B, Chinese/English) and LLaMA2 (13B, English).
+
+## 1 Environment setup
+(1) On Windows 11, install Miniconda3-py39_23.5.2-0-Windows-x86_64.exe. Download link:
+https://docs.conda.io/en/latest/miniconda.html#windows-installers
+
+On Ubuntu, download and install:
+```
+wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
+chmod -R 777 Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
+./Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
+sudo apt install numactl
+```
+
+(2) Open an Anaconda Powershell Prompt window:
+```
+conda create -n llm python=3.9
+conda activate llm
+pip install --pre --upgrade bigdl-llm[all]
+pip install bigdl-nano
+pip install gradio==3.41.1 mdtex2html
+```
+Alternatively, install a pinned version:
+```
+ pip install --pre bigdl-llm[all]==2.4.0b20231110 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+## 2 LLM model conversion
+Using ChatGLM2 and Llama2 as examples, download the Hugging Face FP16 models:
+
+· ChatGLM2-6B: https://huggingface.co/THUDM/chatglm2-6b/tree/main
+
+· Llama2-13B: https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/tree/main
+
+
+### 2.1 Converting FP16 to Native INT4 and calling it from Python (recommended on CPU)
+Convert ChatGLM2 and Llama2 to Native INT4.
+
+Open Anaconda PowerShell, adjust the model paths and the output folder name, then run:
+```
+ conda activate llm
+ llm-convert "/llm-models/chatglm2-6b/" --model-format pth --model-family "chatglm" --outfile "checkpoint/"
+ llm-convert "/llm-models/llama-2-13b-chat-hf/" --model-format pth --model-family "llama" --outfile "checkpoint/"
+```
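+
+The same conversion can also be scripted. A short sketch (an illustrative addition; it assumes bigdl-llm exposes a llm_convert Python function mirroring the llm-convert CLI above):
+```
+# Hypothetical scripted equivalent of the llm-convert commands above.
+from bigdl.llm import llm_convert
+bin_path = llm_convert(model="/llm-models/chatglm2-6b/",
+                       outfile="checkpoint/", outtype="int4",
+                       model_family="chatglm")
+print(bin_path)  # path of the generated q4_0 .bin file
+```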
+
+
+#### Loading the Native INT4 model from Python
+Parameter notes:
+
+(1) n_threads
+On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket.
+On a two-socket SPR system, inference must be pinned to all physical cores of the first socket.
+
+(2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+from bigdl.llm.transformers import BigdlNativeForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+
+elif model_name == "llama2-13b":
+    model = BigdlNativeForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_all_local_path + "\\bigdl_llm_llama2_13b_q4_0.bin",
+        model_family='llama',n_threads=20,n_ctx=4096)
+```
+### 2.2 Converting FP16 to transformer INT4 and calling it from Python
+Transformer INT4 runs somewhat slower than Native INT4 on CPU.
+
+Convert the model to transformer INT4 with a Python script
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+from bigdl.llm.transformers import AutoModelForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+
+if model_name == "chatglm2-6b":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\chatglm2-6b-int4\\")
+    tokenizer.save_pretrained("D:\\llm-models\\chatglm2-6b-int4\\")
+
+elif model_name == "llama2-13b":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\"+model_name)
+    tokenizer.save_pretrained("D:\\llm-models\\"+model_name)
+```
+Load the transformer INT4 model from Python
+```
+if model_name == "chatglm2-6b":
+    model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+elif model_name == "llama2-13b":
+    model = AutoModelForCausalLM.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+```
+## 3 Benchmarking the LLM on CPU
+Benchmarking the Native INT4 models on CPU uses all cores, which makes the numbers easy to compare against the application UI's performance metrics.
+
+On Ubuntu, assuming the first SPR socket has 48 physical cores: numactl -C 0-47 -m 0 $command
+```
+$ lscpu
+NUMA node0 CPU(s):   0-47,96-143
+NUMA node1 CPU(s):   48-95,144-191
+
+Therefore, you will set parameters like:
+$ export OMP_NUM_THREADS=48
+$ numactl -C 0-47 -m 0 llm-cli -t 48 ……
+```
+
+```
+sudo apt install numactl
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+source bigdl-nano-init -c
+export OMP_NUM_THREADS=48
+export TRANSFORMERS_OFFLINE=1
+$ numactl -C 0-47 -m 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
+
+numactl -C 0-47 -m 0 llm-cli -t 48 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+```
+
+On Windows, assuming the first SPR socket has 48 physical cores: start /node 0 $command
+```
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+
+start /node 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
+```
+
+Parameter note: -n 32 caps the output at 32 tokens.
+
+Extract the performance figures from the command-line output, as shown in the figure:
+
+Input token: 32 tokens
+
+Output token: 32 tokens (31 runs = 32 tokens – 1st token)
+
+1st token avg latency (ms) = 1541.56 ms
+
+2nd+ token avg latency (ms/token) = 125.62 ms per token
+
+Figure 1: llm-cli output
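+
+To sanity-check these numbers, the end-to-end latency can be reconstructed from the two averages (an illustrative calculation using the sample values quoted above):
+```
+# Worked example with the sample numbers quoted above.
+first_token_ms = 1541.56
+next_token_ms = 125.62
+n_out = 32
+total_ms = first_token_ms + (n_out - 1) * next_token_ms
+print(f"end-to-end: {total_ms:.2f} ms")                      # ~5435.78 ms
+print(f"throughput: {n_out / (total_ms / 1000):.2f} tok/s")  # ~5.89 tok/s
+```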
+
+## 4 Streaming the output token by token
+### 4.1 (recommended on CPU): Native INT4 for chatglm2 and llama2
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+prompt = "What is AI?"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+    response = ""
+    for chunk in model(prompt, temperature=0.95,top_p=0.8,stream=True,max_tokens=512):
+        response += chunk['choices'][0]['text']
+```
+llama2 streams the same way, also with a for loop.
+
+Parameter descriptions:
+
+· Temperature (higher values make the output more random), adjustable from 0 to 1
+
+· Top P (higher values make word choice more diverse), adjustable from 0 to 1
+
+· Max Length (maximum number of output tokens), adjustable from 0 to 2048; the ceiling is model-dependent. These models support n_ctx up to 8k, so input plus output tokens should stay below 8k.
+
+### 4.2 Transformer INT4 stream_chat, chatglm2 only
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+import torch
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+prompt = "What is AI?"
+history = []  # list of (query, response) pairs from earlier turns
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+with torch.inference_mode():
+    for response, history in model.stream_chat(tokenizer, prompt, history, max_length=512, top_p=0.9,temperature=0.9):
+        print(response)
+```
+### 4.3 Transformer INT4 TextIteratorStreamer for chatglm2 and llama2
+```
+import time
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer,TextIteratorStreamer
+import torch
+from benchmark_util import BenchmarkWrapper
+
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+prompt = "What is AI?"
+with torch.inference_mode():
+    model=BenchmarkWrapper(model)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    response = ""
+    timeStart = time.time()
+    # out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=512)
+    generate_kwargs = dict(**inputs,streamer=streamer,temperature=0.9, top_p=0.9, max_new_tokens=512)
+    from threading import Thread
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    for new_text in streamer:
+        response += new_text
+    timeCost = time.time() - timeStart
+    token_count_input = len(tokenizer.tokenize(prompt))
+
+```
+
+## 5 Adding multi-turn chat history
+### 5.1 For chatglm2 Transformer INT4 stream_chat only
+See the code in section 4.2.
+### 5.2 Adding multi-turn chat history for Native INT4
+```
+import time
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+history_round = 0
+history = ""  # concatenated prompts and responses of earlier rounds
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+
+def predict(input):
+    global history_round, model, history
+    response = ""
+    if (history and len(model.tokenize(history)) > 2500) or history_round >= 5:  ### keep at most 5 rounds of history
+        history_round = 0
+        history = ""
+        print("*********** reset chatbot and history", history)
+
+    if len(history) == 0:
+        print("*********** new chat ")
+        prompt = input
+        history = prompt
+        history_round = 1
+    else:
+        prompt = history + '\n' + input
+        history_round += 1
+    print("******************* history_round ", history_round)
+
+    timeStart = time.time()
+    for chunk in model(prompt, temperature=0.9,top_p=0.9, stream=True,max_tokens=512):
+        response += chunk['choices'][0]['text']
+    history = prompt + response
+    print("******** max_length history", len(model.tokenize(history)))
+
+input = "你好"  # "Hello"
+predict(input)
+input = "请进行丽江三天必游景点旅游规划"  # "Plan a three-day itinerary of must-see sights in Lijiang"
+predict(input)
+```
+### 5.3 For transformer INT4 TextIteratorStreamer, same as 5.2
+## 6 Building the Web UI with Gradio
+Download the code: https://github.com/KiwiHana/LLM_UI_Windows_CPU
+
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5a399c7e-31b4-4337-a6a4-bc6f8bccb93c)
+Figure 2: the LLM_UI_Windows_CPU interface
+
+
+To use all cores, open an Anaconda Powershell Prompt window as administrator and run LLM_demo_v1.0.py or LLM_demo_v2.0.py.
+```
+git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git
+cd LLM_UI_Windows_CPU
+conda activate llm
+python LLM_demo_v1.0.py
+```
+Note: change the model storage path in the main function at line 285 of the LLM_demo_v1.0.py script,
+
+e.g. model_all_local_path = "C:/Users/username/checkpoint/"
+
+· The LLM application UI v1.0 folder should contain:
+
+LLM_demo_v1.0.py
+
+theme3.json
+
+checkpoint
+
+-- bigdl_llm_llama2_13b_q4_0.bin
+
+-- ggml-chatglm2-6b-q4_0.bin
+
+
+References:
+
+https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup

From 04597b9c4db8ad0e2be6d0838a22042642345818 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:44:01 +0800
Subject: [PATCH 13/15] =?UTF-8?q?Rename=20=E5=9F=BA=E4=BA=8EXeon=E6=88=96S?=
 =?UTF-8?q?PR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md=20to=20=E5=9F=BA?=
 =?UTF-8?q?=E4=BA=8EXeon=E5=92=8CSPR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...45\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" => "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" (100%)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
similarity index 100%
rename from "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
rename to "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"

From 61f4cf670366cc9da8f79b16e0c3bc6988e84614 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:50:34 +0800
Subject: [PATCH 14/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EXeon=E5=92=8CS?=
 =?UTF-8?q?PR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\213\345\272\224\347\224\250\345\274\200\345\217\221.md" | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
index 1737f1b..fdd1bdc 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
@@ -4,6 +4,10 @@
 The UI runs on a Windows 11 x86 CPU or on Ubuntu, and serves optimized Native INT4 LLMs on systems with six or more 16 GB DIMMs.
 Two models serve as examples: ChatGLM2 (6B, Chinese/English) and LLaMA2 (13B, English).
 
+Note: Production server systems usually do not ship with all CPU memory slots populated. In that case, the DIMMs must be installed in specific slots according to the actual number of modules, or system stability and performance will suffer. The diagram below shows the Eagle Stream memory population scheme.
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/a54c74cc-6581-4f9e-b2b4-3780bbcfe2a6)
+
+
 ## 1 Environment setup
 (1) On Windows 11, install Miniconda3-py39_23.5.2-0-Windows-x86_64.exe. Download link:
 https://docs.conda.io/en/latest/miniconda.html#windows-installers
@@ -158,6 +162,8 @@
 2nd+ token avg latency (ms/token) = 125.62 ms per token
 
 Figure 1: llm-cli output
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5adf144a-5fc5-432f-b476-f26d341fbced)
+
 
 ## 4 Streaming the output token by token

From ed5a6215817424f4159f6ef5dcf06459a6e8cb0f Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:57:40 +0800
Subject: [PATCH 15/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...00\345\217\221\346\214\207\345\215\227.md" | 28 -------------------
 1 file changed, 28 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 5809166..9537f8a 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -48,34 +48,6 @@ n_threads = number of P-cores × 2 + number of E-cores − 2
 
 Try these three values on each device and keep the one that performs best.
 
-On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket.
-On a two-socket SPR system, inference must be pinned to all physical cores of the first socket.
-
-On Ubuntu, assuming the first SPR socket has 48 physical cores: numactl -C 0-47 -m 0 $command
-```
-sudo apt install numactl
-conda create -n llm python=3.9
-conda activate llm
-pip install bigdl-llm[all]
-pip install bigdl-nano
-source bigdl-nano-init -c
-export OMP_NUM_THREADS=48
-$ numactl -C 0-47 -m 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
-a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
-people, and have fun" --no-mmap -v -n 32
-```
-
-On Windows, assuming the first SPR socket has 48 physical cores: start /node 0 $command
-```
-conda create -n llm python=3.9
-conda activate llm
-pip install bigdl-llm[all]
-pip install bigdl-nano
-> start /node 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
-a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
-people, and have fun" --no-mmap -v -n 32
-```
-
 (2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
 ```
 from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM