From 82a1e21f5c26231c82c86d3885b7766ad5e12f6c Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 6 Sep 2023 08:19:35 +0800
Subject: [PATCH 01/15] =?UTF-8?q?Create=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...00\345\217\221\346\214\207\345\215\227.md" | 294 ++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 "\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
new file mode 100644
index 0000000..499b122
--- /dev/null
+++ "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -0,0 +1,294 @@
+# A Guide to Developing Large Language Model Applications on 13th Gen Intel CPUs
+
+This document describes how to build a large language model (LLM) application UI on top of the open-source Intel bigdl-llm library and Gradio. The UI runs on a Windows 11 x86 CPU and serves optimized Native INT4 LLMs on a PC with 16 GB of RAM. Three models serve as examples: ChatGLM2 (6B, Chinese/English), LLaMA2 (13B, English), and StarCoder (15.5B, Chinese/English).
+## 1 Environment setup
+(1) On Windows 11, install Miniconda3-py39_23.5.2-0-Windows-x86_64.exe. Download link:
+https://docs.conda.io/en/latest/miniconda.html#windows-installers
+
+(2) Open an Anaconda Powershell Prompt window:
+```
+ conda create -n llm python=3.9
+ conda activate llm
+ pip install --pre --upgrade bigdl-llm[all]
+ pip install gradio mdtex2html
+```
+Alternatively, install a pinned version:
+```
+ pip install --pre bigdl-llm[all]==2.4.0b20230820 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+## 2 LLM model conversion
+Using ChatGLM2, Llama2, and StarCoder as examples, download the Hugging Face FP16 models:
+
+· ChatGLM2-6B: https://huggingface.co/THUDM/chatglm2-6b/tree/main
+
+· Llama2-13B: https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/tree/main
+
+· StarCoder: https://huggingface.co/bigcode/starcoder/tree/main
+
+### 2.1 Converting FP16 to Native INT4 and calling it from Python (recommended on CPU)
+Convert ChatGLM2, Llama2, and StarCoder to Native INT4.
+
+Open Anaconda PowerShell, adjust the model paths and the output folder name, then run:
+```
+ conda activate llm
+ llm-convert "C:/llm-models/chatglm2-6b/" --model-format pth --model-family "chatglm" --outfile "checkpoint/"
+ llm-convert "C:/llm-models/llama-2-13b-chat-hf/" --model-format pth --model-family "llama" --outfile "checkpoint/"
+ llm-convert "C:/llm-models/starcoder/" --model-format pth --model-family "starcoder" --outfile "checkpoint/"
+```
+Note: StarCoder cannot be converted to Native INT4 on a 16 GB machine because it runs out of memory; convert StarCoder to Native INT4 on a machine with more RAM.
+
+#### Loading the Native INT4 model from Python
+Parameter notes:
+
+(1) n_threads = number of P-cores × 2 + number of E-cores, or
+
+n_threads = number of P-cores × 2 + number of E-cores − 1, or
+
+n_threads = number of P-cores × 2 + number of E-cores − 2
+
+Try these three values on each device and keep the one that performs best.
+
+On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket
+```
+export OMP_NUM_THREADS=48
+numactl -C 0-47 -m 0 jupyter notebook
+```
+(2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+from bigdl.llm.transformers import BigdlNativeForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+
+elif model_name == "llama2-13b":
+    model = BigdlNativeForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_all_local_path + "\\bigdl_llm_llama2_13b_q4_0.bin",
+        model_family='llama',n_threads=20,n_ctx=4096)
+elif model_name == "StarCoder":
+    model = BigdlNativeForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_all_local_path + "\\bigdl_llm_starcoder_q4_0.bin",
+        model_family='starcoder',n_threads=20,n_ctx=4096)
+```
+### 2.2 Converting FP16 to transformer INT4 and calling it from Python
+Transformer INT4 runs somewhat slower than Native INT4 on CPU.
+
+Convert the model to transformer INT4 with a Python script
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+from bigdl.llm.transformers import AutoModelForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+
+if model_name == "chatglm2-6b":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\chatglm2-6b-int4\\")
+    tokenizer.save_pretrained("D:\\llm-models\\chatglm2-6b-int4\\")
+
+elif model_name == "llama2-13b" or model_name == "StarCoder":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\"+model_name)
+    tokenizer.save_pretrained("D:\\llm-models\\"+model_name)
+
+Load the transformer INT4 model from Python
+if model_name == "chatglm2-6b":
+    model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+elif model_name == "llama2-13b" or model_name == "StarCoder":
+    model = AutoModelForCausalLM.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+```
+## 3 Benchmarking the LLM on CPU
+Benchmarking the Native INT4 models on CPU uses all cores, which makes the numbers easy to compare against the application UI's performance metrics.
+
+Open an Anaconda PowerShell Prompt
+```
+ conda activate llm
+ChatGLM2:
+ llm-cli -t 20 -x chatglm -m "ggml-chatglm2-6b-q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -v -n 32
+Llama2:
+ llm-cli -t 20 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+Starcoder:
+ llm-cli -t 20 -x starcoder -m "bigdl_llm_starcoder_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+```
+Parameter note: -n 32 caps the output at 32 tokens.
+
+Extract the performance figures from the command-line output, as shown in the figure:
+
+Input token: 32 tokens
+
+Output token: 32 tokens (31 runs = 32 tokens – 1st token)
+
+1st token avg latency (ms) = 1541.56 ms
+
+2nd+ token avg latency (ms/token) = 125.62 ms per token
+
+Figure 1: llm-cli output
+
+## 4 Streaming the output token by token
+### 4.1 (recommended on CPU): Native INT4 for chatglm2, llama2, and starcoder
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+prompt = "What is AI?"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+    response = ""
+    for chunk in model(prompt, temperature=0.95,top_p=0.8,stream=True,max_tokens=512):
+        response += chunk['choices'][0]['text']
+```
+llama2 and starcoder stream the same way, also with a for loop.
+
+Parameter descriptions:
+
+· Temperature (higher values make the output more random), adjustable from 0 to 1
+
+· Top P (higher values make word choice more diverse), adjustable from 0 to 1
+
+· Max Length (maximum number of output tokens), adjustable from 0 to 2048; the ceiling is model-dependent. These three models support n_ctx up to 8k, so input plus output tokens should stay below 8k.
+
+### 4.2 Transformer INT4 stream_chat, chatglm2 only
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+import torch
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+prompt = "What is AI?"
+history = []  # list of (query, response) pairs from earlier turns
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+with torch.inference_mode():
+    for response, history in model.stream_chat(tokenizer, prompt, history, max_length=512, top_p=0.9,temperature=0.9):
+        print(response)
+```
+### 4.3 Transformer INT4 TextIteratorStreamer for chatglm2, llama2, and starcoder
+```
+import time
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer,TextIteratorStreamer
+import torch
+from benchmark_util import BenchmarkWrapper
+
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+prompt = "What is AI?"
+with torch.inference_mode():
+    model=BenchmarkWrapper(model)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    response = ""
+    timeStart = time.time()
+    # out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=512)
+    generate_kwargs = dict(**inputs,streamer=streamer,temperature=0.9, top_p=0.9, max_new_tokens=512)
+    from threading import Thread
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    for new_text in streamer:
+        response += new_text
+    timeCost = time.time() - timeStart
+    token_count_input = len(tokenizer.tokenize(prompt))
+
+    token_count_output = int(out[0,2])+1
+    ms_first_token = float(out[0,0])
+    ms_after_token = float(out[0,1]) ## the Tensor out is defined in benchmark_util.py L2476
+```
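+
+One step is worth making explicit here (an illustrative addition, not from the original guide): join the generation thread before reading the timings, and recover a rough output-token count from the accumulated text:
+```
+# Hypothetical wrap-up for the streaming loop above.
+thread.join()  # make sure generation has fully finished
+token_count_output = len(tokenizer.tokenize(response))  # rough count
+print(f"{token_count_output / timeCost:.2f} tokens/s")
+```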
print("******************* history_round ", history_round) + + timeStart = time.time() + for chunk in model(prompt, temperature=0.9,top_p=0.9, stream=True,max_tokens=512): + response += chunk['choices'][0]['text'] + history = prompt + response + print("******** max_length history",len(model.tokenize(history))) +``` +### 5.3 对于transformer INT4 TextIteratorStreamer同5.2 +## 6 用gradio写Web UI +gradio模板风格库:https://huggingface.co/spaces/gradio/theme-gallery + +下载代码:https://github.com/KiwiHana/LLM_UI_Windows_CPU + +为了使用全部核,用管理员打开Anaconda Powershell Prompt窗口,运行LLM_demo_v1.0.py 或 LLM_demo_v2.0.py。 +``` +git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git +cd LLM_UI_Windows_CPU +conda activate llm +python LLM_demo_v1.0.py +``` +Note: 修改LLM_demo_v1.0.py脚本第285行 main函数里的模型存放路径, + +例如 model_all_local_path = "C:/Users/username/checkpoint/" + +· 大语言模型应用UIv1.0文件夹应包含: + +LLM_demo_v1.0.py + +theme3.json + +checkpoint + +-- bigdl_llm_llama2_13b_q4_0.bin + +-- bigdl_llm_starcoder_q4_0.bin + +-- ggml-chatglm2-6b-q4_0.bin + +修改Run_Intel_LLM_Demo.bat里环境名称如llm,代码路径。为了使用全部核,用管理员权限打开Run_Intel_LLM_Demo.bat +``` +D: +cd D:\PC_LLM_UI\ +call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3 +call conda activate llm +start python LLM_demo_v1.0.py +``` +参考链接: + +https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup + +https://github.com/intel-analytics/BigDL From 624b85537e24cd5ec11eb16952f797cf37baabfd Mon Sep 17 00:00:00 2001 From: KiwiHana <102839943+KiwiHana@users.noreply.github.com> Date: Wed, 6 Sep 2023 08:28:03 +0800 Subject: [PATCH 02/15] =?UTF-8?q?Update=20=E5=A4=A7=E8=AF=AD=E8=A8=80?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?= =?UTF-8?q?=E5=8D=97.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...74\200\345\217\221\346\214\207\345\215\227.md" | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" index 499b122..db002af 100644 --- "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" +++ "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" @@ -250,10 +250,12 @@ def predict(input): ``` ### 5.3 对于transformer INT4 TextIteratorStreamer同5.2 ## 6 用gradio写Web UI -gradio模板风格库:https://huggingface.co/spaces/gradio/theme-gallery - 下载代码:https://github.com/KiwiHana/LLM_UI_Windows_CPU +![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5a399c7e-31b4-4337-a6a4-bc6f8bccb93c) +图2:LLM_UI_Windows_CPU界面 + + 为了使用全部核,用管理员打开Anaconda Powershell Prompt窗口,运行LLM_demo_v1.0.py 或 LLM_demo_v2.0.py。 ``` git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git @@ -279,14 +281,7 @@ checkpoint -- ggml-chatglm2-6b-q4_0.bin -修改Run_Intel_LLM_Demo.bat里环境名称如llm,代码路径。为了使用全部核,用管理员权限打开Run_Intel_LLM_Demo.bat -``` -D: -cd D:\PC_LLM_UI\ -call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3 -call conda activate llm -start python LLM_demo_v1.0.py -``` + 参考链接: https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup From d3817124a90be845d685938c6081d5e0bd055e2e Mon Sep 17 
+### 5.3 For transformer INT4 TextIteratorStreamer, same as 5.2
+## 6 Building the Web UI with Gradio
+Gradio theme gallery: https://huggingface.co/spaces/gradio/theme-gallery
+
+Download the code: https://github.com/KiwiHana/LLM_UI_Windows_CPU
+
+To use all cores, open an Anaconda Powershell Prompt window as administrator and run LLM_demo_v1.0.py or LLM_demo_v2.0.py.
+```
+git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git
+cd LLM_UI_Windows_CPU
+conda activate llm
+python LLM_demo_v1.0.py
+```
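+
+The heart of such a UI fits in a few lines. A minimal illustration (an assumption-laden sketch, not the contents of LLM_demo_v1.0.py; it assumes the gradio 3.4x ChatInterface API and the Native INT4 `model` object from section 2.1):
+```
+import gradio as gr
+
+def chat_fn(message, history):
+    # Stream partial responses back to the UI as the model emits chunks.
+    response = ""
+    for chunk in model(message, stream=True, max_tokens=512):
+        response += chunk['choices'][0]['text']
+        yield response
+
+gr.ChatInterface(chat_fn).launch()
+```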
+Note: change the model storage path in the main function at line 285 of the LLM_demo_v1.0.py script,
+
+e.g. model_all_local_path = "C:/Users/username/checkpoint/"
+
+· The LLM application UI v1.0 folder should contain:
+
+LLM_demo_v1.0.py
+
+theme3.json
+
+checkpoint
+
+-- bigdl_llm_llama2_13b_q4_0.bin
+
+-- bigdl_llm_starcoder_q4_0.bin
+
+-- ggml-chatglm2-6b-q4_0.bin
+
+Modify the environment name (e.g. llm) and the code path in Run_Intel_LLM_Demo.bat. To use all cores, open Run_Intel_LLM_Demo.bat with administrator privileges
+```
+D:
+cd D:\PC_LLM_UI\
+call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3
+call conda activate llm
+start python LLM_demo_v1.0.py
+```
+References:
+
+https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup
+
+https://github.com/intel-analytics/BigDL

From 624b85537e24cd5ec11eb16952f797cf37baabfd Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 6 Sep 2023 08:28:03 +0800
Subject: [PATCH 02/15] =?UTF-8?q?Update=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...74\200\345\217\221\346\214\207\345\215\227.md" | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 499b122..db002af 100644
--- "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -250,10 +250,12 @@
 ```
 ### 5.3 For transformer INT4 TextIteratorStreamer, same as 5.2
 ## 6 Building the Web UI with Gradio
-Gradio theme gallery: https://huggingface.co/spaces/gradio/theme-gallery
-
 Download the code: https://github.com/KiwiHana/LLM_UI_Windows_CPU
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5a399c7e-31b4-4337-a6a4-bc6f8bccb93c)
+Figure 2: the LLM_UI_Windows_CPU interface
+
+
 To use all cores, open an Anaconda Powershell Prompt window as administrator and run LLM_demo_v1.0.py or LLM_demo_v2.0.py.
 ```
 git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git
 cd LLM_UI_Windows_CPU
 conda activate llm
 python LLM_demo_v1.0.py
@@ -279,14 +281,7 @@ checkpoint
 
 -- bigdl_llm_llama2_13b_q4_0.bin
 
 -- bigdl_llm_starcoder_q4_0.bin
 
 -- ggml-chatglm2-6b-q4_0.bin
 
-Modify the environment name (e.g. llm) and the code path in Run_Intel_LLM_Demo.bat. To use all cores, open Run_Intel_LLM_Demo.bat with administrator privileges
-```
-D:
-cd D:\PC_LLM_UI\
-call C:\Users\LLM\miniconda3\Scripts\activate.bat C:\Users\LLM\miniconda3
-call conda activate llm
-start python LLM_demo_v1.0.py
-```
+
 References:
 
 https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup

From d3817124a90be845d685938c6081d5e0bd055e2e Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 6 Sep 2023 08:31:28 +0800
Subject: [PATCH 03/15] =?UTF-8?q?Update=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From 89e62e7e59341aa5b917d5065a9f055ddfa872d0 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:34:56 +0800
Subject: [PATCH 04/15] =?UTF-8?q?Rename=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md=20to=20Chinese=5FVersion=20/ch=5F8=5FApplications?=
 =?UTF-8?q?=E5=A4=A7Chinese=5FVersion=20/ch=5F8=5FApplications=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91?=
 =?UTF-8?q?=E6=8C=87=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (100%)

diff --git "a/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 100%
rename from "\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

From b636b7d85986aaa0b372ba84791c36b928d783b9 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:35:44 +0800
Subject: [PATCH 05/15] =?UTF-8?q?Rename=20ch=5F8=5FApplications=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91?=
 =?UTF-8?q?=E6=8C=87=E5=8D=97.md=20to=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (100%)

diff --git "a/Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 100%
rename from "Chinese_Version /ch_8_Applications\345\244\247Chinese_Version /ch_8_Applications\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

From 48b85bab50f8b2d7f872d3e3cfbb6e96e32040d2 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:36:30 +0800
Subject: [PATCH 06/15] =?UTF-8?q?Rename=20=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87?=
 =?UTF-8?q?=E5=8D=97.md=20to=20=E5=9F=BA=E4=BA=8EIntel=2013=E4=BB=A3CPU?=
 =?UTF-8?q?=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94?=
 =?UTF-8?q?=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...47\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (100%)

diff --git "a/Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 100%
rename from "Chinese_Version /ch_8_Applications/\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"

From 7ffb334b9896484c0552a3eaae026b2b7a94b2fb Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 8 Sep 2023 09:38:47 +0800
Subject: [PATCH 07/15] =?UTF-8?q?Update=20and=20rename=20=E5=9F=BA?=
 =?UTF-8?q?=E4=BA=8EIntel=2013=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91?=
 =?UTF-8?q?=E6=8C=87=E5=8D=97.md=20to=20=E5=9F=BA=E4=BA=8EIntel13=E4=BB=A3?=
 =?UTF-8?q?CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 2 --
 1 file changed, 2 deletions(-)
 rename "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" => "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" (99%)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
similarity index 99%
rename from "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
rename to "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index db002af..cbb3c5b 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel 13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -285,5 +285,3 @@ checkpoint
 References:
 
 https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup
-
-https://github.com/intel-analytics/BigDL

From 15be3e6ad4bb7b0414906e0b8f4e899277674289 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Fri, 13 Oct 2023 09:40:39 +0800
Subject: [PATCH 08/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index cbb3c5b..6eabe7d 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -10,7 +10,7 @@ https://docs.conda.io/en/latest/miniconda.html#windows-installers
  conda create -n llm python=3.9
  conda activate llm
  pip install --pre --upgrade bigdl-llm[all]
- pip install gradio mdtex2html
+ pip install gradio==3.41.1 mdtex2html
 ```
 Alternatively, install a pinned version:
 ```

From 7f35187aebfb790be86097b6452c52eb557556c6 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 25 Oct 2023 00:15:26 +0800
Subject: [PATCH 09/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...00\345\217\221\346\214\207\345\215\227.md" | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 6eabe7d..fe93e4e 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -48,11 +48,34 @@ n_threads = number of P-cores × 2 + number of E-cores − 2
 
 Try these three values on each device and keep the one that performs best.
 
-On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket
+On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket.
+On a two-socket SPR system, inference must be pinned to all physical cores of the first socket.
+
+On Ubuntu, assuming the first SPR socket has 48 physical cores: numactl -C 0-47 -m 0 $command
 ```
+sudo apt install numactl
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+source bigdl-nano-init -c
 export OMP_NUM_THREADS=48
-numactl -C 0-47 -m 0 jupyter notebook
+$ numactl -C 0-47 -m 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
+```
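+
+A quick way to compute the per-socket physical core count (an illustrative sketch, not part of the original patch; it assumes the psutil package is installed and two identical sockets):
+```
+# Hypothetical helper: physical cores per socket on a dual-socket machine.
+import psutil
+total_physical = psutil.cpu_count(logical=False)  # both sockets combined
+per_socket = total_physical // 2                  # assumes 2 identical sockets
+print(per_socket)  # candidate value for OMP_NUM_THREADS and llm-cli -t
+```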
+
+On Windows, assuming the first SPR socket has 48 physical cores: start /node 0 $command
+```
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+> start /node 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
 ```
+
 (2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
 ```
 from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
@@ -94,8 +117,9 @@ elif model_name == "llama2-13b" or model_name == "StarCoder":
     model = AutoModelForCausalLM.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
     model.save_low_bit("D:\\llm-models\\"+model_name)
     tokenizer.save_pretrained("D:\\llm-models\\"+model_name)
-
+```
 Load the transformer INT4 model from Python
+```
 if model_name == "chatglm2-6b":
     model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
     model = model.eval()

From e5e3ab1ec0ab811867373d0144fe01735b653dd5 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 25 Oct 2023 00:16:35 +0800
Subject: [PATCH 10/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...5\274\200\345\217\221\346\214\207\345\215\227.md" | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index fe93e4e..9f70a1e 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -135,12 +135,12 @@
 Open an Anaconda PowerShell Prompt
 ```
 conda activate llm
-ChatGLM2:
- llm-cli -t 20 -x chatglm -m "ggml-chatglm2-6b-q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -v -n 32
-Llama2:
- llm-cli -t 20 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
-Starcoder:
- llm-cli -t 20 -x starcoder -m "bigdl_llm_starcoder_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+#ChatGLM2:
+$ llm-cli -t 20 -x chatglm -m "ggml-chatglm2-6b-q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -v -n 32
+#Llama2:
+$ llm-cli -t 20 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+#Starcoder:
+$ llm-cli -t 20 -x starcoder -m "bigdl_llm_starcoder_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
 ```
 Parameter note: -n 32 caps the output at 32 tokens.

From 6b28198f6eb1c109ff9f52951fe7bf9cbc4a0fbe Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Wed, 25 Oct 2023 00:20:19 +0800
Subject: [PATCH 11/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" | 3 ---
 1 file changed, 3 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 9f70a1e..5809166 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -226,9 +226,6 @@ with torch.inference_mode():
     timeCost = time.time() - timeStart
     token_count_input = len(tokenizer.tokenize(prompt))
 
-    token_count_output = int(out[0,2])+1
-    ms_first_token = float(out[0,0])
-    ms_after_token = float(out[0,1]) ## the Tensor out is defined in benchmark_util.py L2476
 ```
 
 ## 5 Adding multi-turn chat history

From a824a036889ccbdc8eff6194c2f4603a3d84400d Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:43:40 +0800
Subject: [PATCH 12/15] =?UTF-8?q?Create=20=E5=9F=BA=E4=BA=8EXeon=E6=88=96S?=
 =?UTF-8?q?PR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...24\347\224\250\345\274\200\345\217\221.md" | 309 ++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100644 "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
new file mode 100644
index 0000000..1737f1b
--- /dev/null
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
@@ -0,0 +1,309 @@
+# A Guide to Developing Large Language Model Applications on Intel Xeon and SPR
+
+This document describes how to build a large language model (LLM) application UI on top of the open-source Intel bigdl-llm library and Gradio.
+The UI runs on a Windows 11 x86 CPU or on Ubuntu, and serves optimized Native INT4 LLMs on systems with six or more 16 GB DIMMs.
+Two models serve as examples: ChatGLM2 (6B, Chinese/English) and LLaMA2 (13B, English).
+
+## 1 Environment setup
+(1) On Windows 11, install Miniconda3-py39_23.5.2-0-Windows-x86_64.exe. Download link:
+https://docs.conda.io/en/latest/miniconda.html#windows-installers
+
+On Ubuntu, download and install:
+```
+wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
+chmod -R 777 Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
+./Miniconda3-py39_23.5.2-0-Linux-x86_64.sh
+sudo apt install numactl
+```
+
+(2) Open an Anaconda Powershell Prompt window:
+```
+conda create -n llm python=3.9
+conda activate llm
+pip install --pre --upgrade bigdl-llm[all]
+pip install bigdl-nano
+pip install gradio==3.41.1 mdtex2html
+```
+Alternatively, install a pinned version:
+```
+ pip install --pre bigdl-llm[all]==2.4.0b20231110 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+## 2 LLM model conversion
+Using ChatGLM2 and Llama2 as examples, download the Hugging Face FP16 models:
+
+· ChatGLM2-6B: https://huggingface.co/THUDM/chatglm2-6b/tree/main
+
+· Llama2-13B: https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/tree/main
+
+
+### 2.1 Converting FP16 to Native INT4 and calling it from Python (recommended on CPU)
+Convert ChatGLM2 and Llama2 to Native INT4.
+
+Open Anaconda PowerShell, adjust the model paths and the output folder name, then run:
+```
+ conda activate llm
+ llm-convert "/llm-models/chatglm2-6b/" --model-format pth --model-family "chatglm" --outfile "checkpoint/"
+ llm-convert "/llm-models/llama-2-13b-chat-hf/" --model-format pth --model-family "llama" --outfile "checkpoint/"
+```
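+
+The same conversion can also be scripted. A short sketch (an illustrative addition; it assumes bigdl-llm exposes a llm_convert Python function mirroring the llm-convert CLI above):
+```
+# Hypothetical scripted equivalent of the llm-convert commands above.
+from bigdl.llm import llm_convert
+bin_path = llm_convert(model="/llm-models/chatglm2-6b/",
+                       outfile="checkpoint/", outtype="int4",
+                       model_family="chatglm")
+print(bin_path)  # path of the generated q4_0 .bin file
+```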
+
+
+#### Loading the Native INT4 model from Python
+Parameter notes:
+
+(1) n_threads
+On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket.
+On a two-socket SPR system, inference must be pinned to all physical cores of the first socket.
+
+(2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+from bigdl.llm.transformers import BigdlNativeForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+
+elif model_name == "llama2-13b":
+    model = BigdlNativeForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=model_all_local_path + "\\bigdl_llm_llama2_13b_q4_0.bin",
+        model_family='llama',n_threads=20,n_ctx=4096)
+```
+### 2.2 Converting FP16 to transformer INT4 and calling it from Python
+Transformer INT4 runs somewhat slower than Native INT4 on CPU.
+
+Convert the model to transformer INT4 with a Python script
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+from bigdl.llm.transformers import AutoModelForCausalLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+
+if model_name == "chatglm2-6b":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\chatglm2-6b-int4\\")
+    tokenizer.save_pretrained("D:\\llm-models\\chatglm2-6b-int4\\")
+
+elif model_name == "llama2-13b":
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_name_local, trust_remote_code=True, load_in_4bit=True)
+    model.save_low_bit("D:\\llm-models\\"+model_name)
+    tokenizer.save_pretrained("D:\\llm-models\\"+model_name)
+```
+Load the transformer INT4 model from Python
+```
+if model_name == "chatglm2-6b":
+    model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+elif model_name == "llama2-13b":
+    model = AutoModelForCausalLM.load_low_bit(model_name_local,trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+    model = model.eval()
+```
+## 3 Benchmarking the LLM on CPU
+Benchmarking the Native INT4 models on CPU uses all cores, which makes the numbers easy to compare against the application UI's performance metrics.
+
+On Ubuntu, assuming the first SPR socket has 48 physical cores: numactl -C 0-47 -m 0 $command
+```
+$ lscpu
+NUMA node0 CPU(s):   0-47,96-143
+NUMA node1 CPU(s):   48-95,144-191
+
+Therefore, you will set parameters like:
+$ export OMP_NUM_THREADS=48
+$ numactl -C 0-47 -m 0 llm-cli -t 48 ……
+```
+
+```
+sudo apt install numactl
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+source bigdl-nano-init -c
+export OMP_NUM_THREADS=48
+export TRANSFORMERS_OFFLINE=1
+$ numactl -C 0-47 -m 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
+
+numactl -C 0-47 -m 0 llm-cli -t 48 -x llama -m "bigdl_llm_llama2_13b_q4_0.bin" -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" --no-mmap -n 32
+```
+
+On Windows, assuming the first SPR socket has 48 physical cores: start /node 0 $command
+```
+conda create -n llm python=3.9
+conda activate llm
+pip install bigdl-llm[all]
+pip install bigdl-nano
+
+start /node 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
+a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
+people, and have fun" --no-mmap -v -n 32
+```
+
+Parameter note: -n 32 caps the output at 32 tokens.
+
+Extract the performance figures from the command-line output, as shown in the figure:
+
+Input token: 32 tokens
+
+Output token: 32 tokens (31 runs = 32 tokens – 1st token)
+
+1st token avg latency (ms) = 1541.56 ms
+
+2nd+ token avg latency (ms/token) = 125.62 ms per token
+
+Figure 1: llm-cli output
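+
+To sanity-check these numbers, the end-to-end latency can be reconstructed from the two averages (an illustrative calculation using the sample values quoted above):
+```
+# Worked example with the sample numbers quoted above.
+first_token_ms = 1541.56
+next_token_ms = 125.62
+n_out = 32
+total_ms = first_token_ms + (n_out - 1) * next_token_ms
+print(f"end-to-end: {total_ms:.2f} ms")                      # ~5435.78 ms
+print(f"throughput: {n_out / (total_ms / 1000):.2f} tok/s")  # ~5.89 tok/s
+```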
+
+## 4 Streaming the output token by token
+### 4.1 (recommended on CPU): Native INT4 for chatglm2 and llama2
+```
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+prompt = "What is AI?"
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+    response = ""
+    for chunk in model(prompt, temperature=0.95,top_p=0.8,stream=True,max_tokens=512):
+        response += chunk['choices'][0]['text']
+```
+llama2 streams the same way, also with a for loop.
+
+Parameter descriptions:
+
+· Temperature (higher values make the output more random), adjustable from 0 to 1
+
+· Top P (higher values make word choice more diverse), adjustable from 0 to 1
+
+· Max Length (maximum number of output tokens), adjustable from 0 to 2048; the ceiling is model-dependent. These models support n_ctx up to 8k, so input plus output tokens should stay below 8k.
+
+### 4.2 Transformer INT4 stream_chat, chatglm2 only
+```
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer
+import torch
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+prompt = "What is AI?"
+history = []  # list of (query, response) pairs from earlier turns
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+with torch.inference_mode():
+    for response, history in model.stream_chat(tokenizer, prompt, history, max_length=512, top_p=0.9,temperature=0.9):
+        print(response)
+```
+### 4.3 Transformer INT4 TextIteratorStreamer for chatglm2 and llama2
+```
+import time
+from bigdl.llm.transformers import AutoModel
+from transformers import AutoTokenizer,TextIteratorStreamer
+import torch
+from benchmark_util import BenchmarkWrapper
+
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+model_name_local = model_all_local_path + model_name
+model = AutoModel.load_low_bit(model_name_local,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name_local,trust_remote_code=True)
+model = model.eval()
+prompt = "What is AI?"
+with torch.inference_mode():
+    model=BenchmarkWrapper(model)
+    inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    response = ""
+    timeStart = time.time()
+    # out = model.generate(**inputs, streamer=streamer, temperature=0.9, top_p=0.9, max_new_tokens=512)
+    generate_kwargs = dict(**inputs,streamer=streamer,temperature=0.9, top_p=0.9, max_new_tokens=512)
+    from threading import Thread
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    for new_text in streamer:
+        response += new_text
+    timeCost = time.time() - timeStart
+    token_count_input = len(tokenizer.tokenize(prompt))
+
+```
+
+## 5 Adding multi-turn chat history
+### 5.1 For chatglm2 Transformer INT4 stream_chat only
+See the code in section 4.2.
+### 5.2 Adding multi-turn chat history for Native INT4
+```
+import time
+from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM
+model_name = "chatglm2-6b"
+model_all_local_path = "C:\\PC_LLM\\checkpoint\\"
+history_round = 0
+history = ""  # concatenated prompts and responses of earlier rounds
+if model_name == "chatglm2-6b":
+    model = ChatGLM(model_all_local_path + "\\ggml-chatglm2-6b-q4_0.bin", n_threads=20,n_ctx=4096)
+
+def predict(input):
+    global history_round, model, history
+    response = ""
+    if (history and len(model.tokenize(history)) > 2500) or history_round >= 5:  ### keep at most 5 rounds of history
+        history_round = 0
+        history = ""
+        print("*********** reset chatbot and history", history)
+
+    if len(history) == 0:
+        print("*********** new chat ")
+        prompt = input
+        history = prompt
+        history_round = 1
+    else:
+        prompt = history + '\n' + input
+        history_round += 1
+    print("******************* history_round ", history_round)
+
+    timeStart = time.time()
+    for chunk in model(prompt, temperature=0.9,top_p=0.9, stream=True,max_tokens=512):
+        response += chunk['choices'][0]['text']
+    history = prompt + response
+    print("******** max_length history", len(model.tokenize(history)))
+
+input = "你好"  # "Hello"
+predict(input)
+input = "请进行丽江三天必游景点旅游规划"  # "Plan a three-day itinerary of must-see sights in Lijiang"
+predict(input)
+```
+### 5.3 For transformer INT4 TextIteratorStreamer, same as 5.2
+## 6 Building the Web UI with Gradio
+Download the code: https://github.com/KiwiHana/LLM_UI_Windows_CPU
+
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5a399c7e-31b4-4337-a6a4-bc6f8bccb93c)
+Figure 2: the LLM_UI_Windows_CPU interface
+
+
+To use all cores, open an Anaconda Powershell Prompt window as administrator and run LLM_demo_v1.0.py or LLM_demo_v2.0.py.
+```
+git clone https://github.com/KiwiHana/LLM_UI_Windows_CPU.git
+cd LLM_UI_Windows_CPU
+conda activate llm
+python LLM_demo_v1.0.py
+```
+Note: change the model storage path in the main function at line 285 of the LLM_demo_v1.0.py script,
+
+e.g. model_all_local_path = "C:/Users/username/checkpoint/"
+
+· The LLM application UI v1.0 folder should contain:
+
+LLM_demo_v1.0.py
+
+theme3.json
+
+checkpoint
+
+-- bigdl_llm_llama2_13b_q4_0.bin
+
+-- ggml-chatglm2-6b-q4_0.bin
+
+
+References:
+
+https://github.com/intel-analytics/bigdl-llm-tutorial/tree/main/ch_2_Environment_Setup

From 04597b9c4db8ad0e2be6d0838a22042642345818 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:44:01 +0800
Subject: [PATCH 13/15] =?UTF-8?q?Rename=20=E5=9F=BA=E4=BA=8EXeon=E6=88=96S?=
 =?UTF-8?q?PR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md=20to=20=E5=9F=BA?=
 =?UTF-8?q?=E4=BA=8EXeon=E5=92=8CSPR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...45\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" => "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" (100%)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
similarity index 100%
rename from "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\346\210\226SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
rename to "Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"

From 61f4cf670366cc9da8f79b16e0c3bc6988e84614 Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:50:34 +0800
Subject: [PATCH 14/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EXeon=E5=92=8CS?=
 =?UTF-8?q?PR=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...\213\345\272\224\347\224\250\345\274\200\345\217\221.md" | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
index 1737f1b..fdd1bdc 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Xeon\345\222\214SPR\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221.md"
@@ -4,6 +4,10 @@
 The UI runs on a Windows 11 x86 CPU or on Ubuntu, and serves optimized Native INT4 LLMs on systems with six or more 16 GB DIMMs.
 Two models serve as examples: ChatGLM2 (6B, Chinese/English) and LLaMA2 (13B, English).
 
+Note: Production server systems usually do not ship with all CPU memory slots populated. In that case, the DIMMs must be installed in specific slots according to the actual number of modules, or system stability and performance will suffer. The diagram below shows the Eagle Stream memory population scheme.
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/a54c74cc-6581-4f9e-b2b4-3780bbcfe2a6)
+
+
 ## 1 Environment setup
 (1) On Windows 11, install Miniconda3-py39_23.5.2-0-Windows-x86_64.exe. Download link:
 https://docs.conda.io/en/latest/miniconda.html#windows-installers
@@ -158,6 +162,8 @@
 2nd+ token avg latency (ms/token) = 125.62 ms per token
 
 Figure 1: llm-cli output
+![image](https://github.com/KiwiHana/bigdl-llm-tutorial/assets/102839943/5adf144a-5fc5-432f-b476-f26d341fbced)
+
 
 ## 4 Streaming the output token by token

From ed5a6215817424f4159f6ef5dcf06459a6e8cb0f Mon Sep 17 00:00:00 2001
From: KiwiHana <102839943+KiwiHana@users.noreply.github.com>
Date: Mon, 13 Nov 2023 17:57:40 +0800
Subject: [PATCH 15/15] =?UTF-8?q?Update=20=E5=9F=BA=E4=BA=8EIntel13?=
 =?UTF-8?q?=E4=BB=A3CPU=E7=9A=84=E5=A4=A7=E8=AF=AD=E8=A8=80=E6=A8=A1?=
 =?UTF-8?q?=E5=9E=8B=E5=BA=94=E7=94=A8=E5=BC=80=E5=8F=91=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...00\345\217\221\346\214\207\345\215\227.md" | 28 -------------------
 1 file changed, 28 deletions(-)

diff --git "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md" "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
index 5809166..9537f8a 100644
--- "a/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
+++ "b/Chinese_Version /ch_8_Applications/\345\237\272\344\272\216Intel13\344\273\243CPU\347\232\204\345\244\247\350\257\255\350\250\200\346\250\241\345\236\213\345\272\224\347\224\250\345\274\200\345\217\221\346\214\207\345\215\227.md"
@@ -48,34 +48,6 @@ n_threads = number of P-cores × 2 + number of E-cores − 2
 
 Try these three values on each device and keep the one that performs best.
 
-On Xeon, OMP_NUM_THREADS and n_threads should equal the number of physical cores on the first socket.
-On a two-socket SPR system, inference must be pinned to all physical cores of the first socket.
-
-On Ubuntu, assuming the first SPR socket has 48 physical cores: numactl -C 0-47 -m 0 $command
-```
-sudo apt install numactl
-conda create -n llm python=3.9
-conda activate llm
-pip install bigdl-llm[all]
-pip install bigdl-nano
-source bigdl-nano-init -c
-export OMP_NUM_THREADS=48
-$ numactl -C 0-47 -m 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
-a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
-people, and have fun" --no-mmap -v -n 32
-```
-
-On Windows, assuming the first SPR socket has 48 physical cores: start /node 0 $command
-```
-conda create -n llm python=3.9
-conda activate llm
-pip install bigdl-llm[all]
-pip install bigdl-nano
-> start /node 0 llm-cli -t 48 -x chatglm -m "./checkpoint/bigdl_llm_chatglm_q4_0.bin" -p "Once upon
-a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new
-people, and have fun" --no-mmap -v -n 32
-```
-
 (2) n_ctx=4096 means the model's combined input plus output length is capped at 4096 tokens.
 ```
 from bigdl.llm.ggml.model.chatglm.chatglm import ChatGLM