diff --git a/.github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml b/.github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml new file mode 100644 index 00000000000..28e251fb22d --- /dev/null +++ b/.github/actions/ppml/ppml-occlum-EDMM-exampletests-action/action.yml @@ -0,0 +1,192 @@ +name: 'Run PPML Occlum EDMM ExampleTests' +description: 'Run PPML Occlum EDMM ExampleTests' +inputs: + image: + description: 'image' + required: true + default: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum' + image-tag: + description: 'image tag' + required: true + default: '2.4.0-SNAPSHOT-EDMM6' +runs: + using: "composite" + steps: + - name: Run tests + shell: bash + env: + DEFAULT_IMAGE: ${{ inputs.image }}:${{ inputs.image-tag }} + run: | + whoami + + # icx-6's kernel support EDMM + export LOCAL_IP=172.168.0.210 + export CPUSET="6-10" + export CONTAINER_NAME="spark-occlum-edmm-jenkins" + + export DATA_PATH=/home/icx/glorysdj/data + export KEYS_PATH=/home/icx/glorysdj/keys + export SECURE_PASSWORD_PATH=/home/icx/glorysdj/password + export SGX_MEM_SIZE=30GB + export SGX_KERNEL_HEAP=2GB + export IMAGE=${{ env.DEFAULT_IMAGE }} + + docker pull $IMAGE + docker pull intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum:2.4.0-SNAPSHOT-EDMM + + docker stop $CONTAINER_NAME + docker rm -f $CONTAINER_NAME + + docker run -itd \ + --net=host \ + --cpuset-cpus=$CPUSET \ + --oom-kill-disable \ + --device=/dev/sgx/enclave \ + --device=/dev/sgx/provision \ + -v /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket \ + -v $DATA_PATH:/opt/occlum_spark/data \ + -v $KEYS_PATH:/opt/keys \ + --name=$CONTAINER_NAME \ + -e LOCAL_IP=$LOCAL_IP \ + -e SGX_MEM_SIZE=$SGX_MEM_SIZE \ + -e SGX_KERNEL_HEAP=$SGX_KERNEL_HEAP \ + $IMAGE \ + bash -c "tail -f /dev/null" + + status_1_spark_pi=1 + status_2_bigdl_lenet_mnist=1 + status_3_bigdl_resnet_cifar10=1 + status_4_spark_tpch=1 + status_5_spark_ut=0 + status_6_spark_xgboost=1 + status_7_spark_gbt=1 + status_8_pyspark_sklearn=1 + status_9_pyspark_sql=1 + status_10_pyspark_tpch=1 + status_11_spark_lgbm=1 + + if [ $status_1_spark_pi -ne 0 ]; then + echo "################## start spark pi" + echo "example.1 spark pi" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh pi | tee test-spark-pi-sgx.log && \ + cat test-spark-pi-sgx.log | egrep 'Pi is roughly 3'" + status_1_spark_pi=$(echo $?) + fi + + if [ $status_2_bigdl_lenet_mnist -ne 0 ]; then + echo "################## start bigdl lenet mnist" + echo "example.2 bigdl lenet mnist" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's# run_spark_lenet_mnist# run_spark_lenet_mnist -b 4 -e 1#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's# -f /host/data# -f /host/data/lenet#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh lenet -b 8 -e 1 | tee bigdl-lenet-mnist.log && \ + cat bigdl-lenet-mnist.log | egrep 'records/second. Loss is' && \ + sed -i 's# -f /host/data/lenet# -f /host/data#g' run_spark_on_occlum_glibc.sh" + status_2_bigdl_lenet_mnist=$(echo $?) 
+ fi + + if [ $status_3_bigdl_resnet_cifar10 -ne 0 ]; then + echo "################## start bigdl resnet cifar10" + echo "example.3 bigdl resnet cifar10" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's# run_spark_resnet_cifar# run_spark_resnet_cifar --nEpochs 1#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's# -f /host/data# -f /host/data/cifar#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh resnet | tee bigdl-resnet-cifar10.log && \ + cat bigdl-resnet-cifar10.log | egrep 'Current momentum is '&& \ + sed -i 's# -f /host/data/cifar# -f /host/data#g' run_spark_on_occlum_glibc.sh" + status_3_bigdl_resnet_cifar10=$(echo $?) + fi + + if [ $status_4_spark_tpch -ne 0 ]; then + echo "################## start spark tpch" + echo "example.4 spark tpch" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#spark.driver.memory=12g#spark.driver.memory=2g#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#spark.executor.instances=8#spark.executor.instances=2#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#executor-memory 8G#executor-memory 2G#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#-Xmx78g -Xms78g#-Xmx10g -Xms10g#g' run_spark_on_occlum_glibc.sh && \ + sed -i 's#/host/data /host/data/output#/host/data/tpch /host/data/output#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh tpch | tee spark-tpch.log && \ + cat spark-tpch.log | egrep '22 finished-'" + status_4_spark_tpch=$(echo $?) + fi + + if [ $status_5_spark_ut -ne 0 ]; then + echo "################## start spark unit test" + echo "example.5 spark unit test" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#192.168.0.111#$LOCAL_IP#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh ut | tee spark-unit-test.log && \ + cat spark-unit-test.log | egrep 'FINISHED o.a.s.status.api.v1.sql.SqlResourceSuite:'" + status_5_spark_ut=$(echo $?) + fi + + if [ $status_6_spark_xgboost -ne 0 ]; then + echo "################## start spark xgboost" + echo "example.6 spark xgboost" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#-i /host/data -s /host/data/model -t 2 -r 100 -d 2 -w 1#-i /host/data/xgboost -s /host/data/xgboost/model -t 2 -r 10 -d 2 -w 1#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh xgboost | tee spark-xgboost.log && \ + cat spark-xgboost.log | egrep 'end time is'" + status_6_spark_xgboost=$(echo $?) + fi + + if [ $status_7_spark_gbt -ne 0 ]; then + echo "################## start spark gbt" + echo "example.7 spark gbt" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#-i /host/data -s /host/data/model -I 100 -d 5#-i /host/data/gbt -s /host/data/gbt/model -I 10 -d 5#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh gbt | tee spark-gbt.log && \ + cat spark-gbt.log | egrep 'end time is'" + status_7_spark_gbt=$(echo $?) + fi + + if [ $status_8_pyspark_sklearn -ne 0 ]; then + echo "################## start pyspark sklearn Linear Regression" + echo "example.8 pyspark sklearn" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh pysklearn | tee test-pyspark-sklearn-sgx.log && \ + cat test-pyspark-sklearn-sgx.log | egrep 'mean_squared_error'" + status_8_pyspark_sklearn=$(echo $?) 
+ fi + + if [ $status_9_pyspark_sql -ne 0 ]; then + echo "################## start pyspark SQL example" + echo "example.9 pyspark sql" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh pysql | tee test-pyspark-sql-sgx.log && \ + cat test-pyspark-sql-sgx.log | egrep 'Example API finished'" + status_9_pyspark_sql=$(echo $?) + fi + + if [ $status_10_pyspark_tpch -ne 0 ]; then + echo "################## start pyspark tpch" + echo "example.10 pyspark tpch" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + sed -i 's#/host/data/ /host/data/output/ true#/host/data/tpch/ /host/data/output/ false#g' run_spark_on_occlum_glibc.sh && \ + bash run_spark_on_occlum_glibc.sh pytpch | tee pyspark-tpch.log && \ + cat pyspark-tpch.log | egrep 'total time is'" + status_10_pyspark_tpch=$(echo $?) + fi + + if [ $status_11_spark_lgbm -ne 0 ]; then + echo "################## start spark lgbm" + echo "example.11 spark lgbm" + docker exec -i $CONTAINER_NAME bash -c "cd /opt && \ + bash run_spark_on_occlum_glibc.sh lgbm | tee spark-lgbm.log && \ + cat spark-lgbm.log | egrep 'acc:'" + status_11_spark_lgbm=$(echo $?) + fi + + echo "status_1_spark_pi $status_1_spark_pi" + echo "status_2_bigdl_lenet_mnist $status_2_bigdl_lenet_mnist" + echo "status_3_bigdl_resnet_cifar10 $status_3_bigdl_resnet_cifar10" + echo "status_4_spark_tpch $status_4_spark_tpch" + #echo "status_5_spark_ut $status_5_spark_ut" + echo "status_6_spark_xgboost $status_6_spark_xgboost" + echo "status_7_spark_gbt $status_7_spark_gbt" + echo "status_8_pyspark_sklearn $status_8_pyspark_sklearn" + echo "status_9_pyspark_sql $status_9_pyspark_sql" + echo "status_10_pyspark_tpch $status_10_pyspark_tpch" + echo "status_11_spark_lgbm $status_11_spark_lgbm" diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml index 93ab24838b2..815f493239a 100644 --- a/.github/workflows/llm_performance_tests.yml +++ b/.github/workflows/llm_performance_tests.yml @@ -36,6 +36,10 @@ jobs: env: THREAD_NUM: 24 steps: + - name: Set environment variables + shell: bash + run: | + echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV" - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 @@ -55,6 +59,14 @@ jobs: env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} + - name: Download LLMs + shell: bash + run: | + if [ ! -d $LLAMA2_7B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR + fi + - name: Run LLM Performance test env: ANALYTICS_ZOO_ROOT: ${{ github.workspace }} @@ -76,10 +88,6 @@ jobs: THREAD_NUM: 16 ANALYTICS_ZOO_ROOT: ${{ github.workspace }} steps: - - name: Set model directories - shell: bash - run: | - echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV" - name: Set environment variables shell: bash run: | @@ -87,6 +95,7 @@ jobs: echo "LLAMA2_13B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-13b-chat-hf" >> "$GITHUB_ENV" echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV" echo "WHISPER_MEDIUM_ORIGIN_PATH=${ORIGIN_DIR}/whisper-medium" >> "$GITHUB_ENV" + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -112,6 +121,27 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh bash python/llm/test/run-llm-install-tests.sh + + - name: Download LLMs + shell: bash + run: | + if [ ! 
-d $LLAMA2_7B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR + fi + if [ ! -d $LLAMA2_13B_ORIGIN_PATH ]; then + echo "Directory $LLAMA2_13B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-13b-chat-hf -P $ORIGIN_DIR + fi + if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then + echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR + fi + if [ ! -d $WHISPER_MEDIUM_ORIGIN_PATH ]; then + echo "Directory $WHISPER_MEDIUM_ORIGIN_PATH not found. Downloading from FTP server..." + wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/whisper-medium -P $ORIGIN_DIR + fi + - name: Test on xpu shell: bash run: | diff --git a/.github/workflows/nightly_test.yml b/.github/workflows/nightly_test.yml index 0daf89a3272..d316a9ce287 100644 --- a/.github/workflows/nightly_test.yml +++ b/.github/workflows/nightly_test.yml @@ -3,7 +3,7 @@ name: Nightly Test on: #pull_request: - #branches: [ main ] + # branches: [ main ] schedule: - cron: '30 15 * * *' # GMT time, 15:30 GMT == 23:30 China @@ -68,6 +68,7 @@ on: - PPML-Scala-UT - PPML-Python-UT-Spark3 - PPML-Occlum-ExampleTests + - PPML-Occlum-EDMM-ExampleTests - PPML-spark-Local-SimpleQuery-Tests-on-Gramine - PPML-RealTime-ML-Occlum - PPML-RealTime-ML-Occlum-K8s @@ -1402,6 +1403,40 @@ jobs: job-name: PPML-Occlum-ExampleTests runner-hosted-on: 'Shanghai' + PPML-Occlum-EDMM-ExampleTests: + if: ${{ github.event.inputs.artifact == 'PPML-Occlum-EDMM-ExampleTests' }} + runs-on: [self-hosted, EDMM] + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK8 + uses: ./.github/actions/jdk-setup-action + - name: Set up maven + uses: ./.github/actions/maven-setup-action + - name: set env + env: + DEFAULT_IMAGE: '10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-big-data-ml-scala-occlum' + DEFAULT_TAG: '2.4.0-SNAPSHOT-EDMM' + run: | + echo "TAG=${{ github.event.inputs.tag || env.DEFAULT_TAG }}" >> $GITHUB_ENV + echo "IMAGE=${{ github.event.inputs.image || env.DEFAULT_IMAGE }}" >> $GITHUB_ENV + - name: Run Test + uses: ./.github/actions/ppml/ppml-occlum-EDMM-exampletests-action + with: + image: ${{env.IMAGE}} + image-tag: ${{env.TAG}} + - name: Create Job Badge + uses: ./.github/actions/create-job-status-badge + if: ${{ always() }} + with: + secret: ${{ secrets.GIST_SECRET}} + gist-id: ${{env.GIST_ID}} + is-self-hosted-runner: true + file-name: PPML-Occlum-EDMM-ExampleTests.json + type: job + job-name: PPML-Occlum-EDMM-ExampleTests + runner-hosted-on: 'Shanghai' + PPML-RealTime-ML-Occlum: if: ${{ github.event.schedule || github.event.inputs.artifact == 'PPML-RealTime-ML-Occlum' || github.event.inputs.artifact == 'all' }} runs-on: [self-hosted, Vilvarin] diff --git a/README.md b/README.md index 841d9bcd445..3b6373373d7 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ > *It is built on top of the excellent work of [llama.cpp](https://github.com/ggerganov/llama.cpp), [ggml](https://github.com/ggerganov/ggml), [gptq](https://github.com/IST-DASLab/gptq), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), [qlora](https://github.com/artidoro/qlora), [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), [gptq_for_llama](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [chatglm.cpp](https://github.com/li-plus/chatglm.cpp), 
[redpajama.cpp](https://github.com/togethercomputer/redpajama.cpp), [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp), [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp/), etc.*

 ### Latest update
-- **[New]** `bigdl-llm` now supports QLoRA fintuning on Intel GPU; see the the example [here](python/llm/example/gpu/qlora_finetuning).
-- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the the latest GPU examples [here](python/llm/example/gpu).
+- **[New]** `bigdl-llm` now supports QLoRA finetuning on Intel GPU; see the example [here](python/llm/example/GPU/QLoRA-FineTuning).
+- `bigdl-llm` now supports Intel GPU (including Arc, Flex and MAX); see the latest GPU examples [here](python/llm/example/GPU).
 - `bigdl-llm` tutorial is released [here](https://github.com/intel-analytics/bigdl-llm-tutorial).
 - Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLaMA2, ChatGLM/ChatGLM2, MPT, Falcon, Dolly, StarCoder, Whisper, InternLM, QWen, Baichuan, Aquila, MOSS,* and more; see the complete list [here](python/llm/README.md#verified-models).
@@ -76,7 +76,7 @@ input_ids = tokenizer.encode(input_str, ...)
 output_ids = model.generate(input_ids, ...)
 output = tokenizer.batch_decode(output_ids)
 ```
-*See the complete examples [here](python/llm/example/transformers/transformers_int4/).*
+*See the complete examples [here](python/llm/example/CPU/HF-Transformers-AutoModels/Model).*

 #### GPU INT4
 ##### Install
@@ -105,7 +105,7 @@ input_ids = tokenizer.encode(input_str, ...).to('xpu')
 output_ids = model.generate(input_ids, ...)
 output = tokenizer.batch_decode(output_ids.cpu())
 ```
-*See the complete examples [here](python/llm/example/gpu/).*
+*See the complete examples [here](python/llm/example/GPU).*

 #### More Low-Bit Support
 ##### Save and load
@@ -115,7 +115,7 @@ After the model is optimized using `bigdl-llm`, you may save and load the model
 model.save_low_bit(model_path)
 new_model = AutoModelForCausalLM.load_low_bit(model_path)
 ```
-*See the complete example [here](python/llm/example/transformers/transformers_low_bit/).*
+*See the complete example [here](python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load).*

 ##### Additonal data types
@@ -123,7 +123,7 @@ In addition to INT4, You may apply other low bit optimizations (such as *INT8*,
 ```python
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int8")
 ```
-*See the complete example [here](python/llm/example/transformers/transformers_low_bit/).*
+*See the complete example [here](python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types).*

 ***For more details, please refer to the `bigdl-llm` [Document](https://test-bigdl-llm.readthedocs.io/en/main/doc/LLM/index.html), [Readme](python/llm), [Tutorial](https://github.com/intel-analytics/bigdl-llm-tutorial) and [API Doc](https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/LLM/index.html).***
diff --git a/docker/llm/finetune/lora/README.md b/docker/llm/finetune/lora/README.md
deleted file mode 100644
index 98b694cfb21..00000000000
--- a/docker/llm/finetune/lora/README.md
+++ /dev/null
@@ -1,112 +0,0 @@
-## Run BF16-Optimized Lora Finetuning on Kubernetes with OneCCL
-
-[Alpaca Lora](https://github.com/tloen/alpaca-lora/tree/main) uses [low-rank adaption](https://arxiv.org/pdf/2106.09685.pdf) to speed up the finetuning process of base model [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b), and tries to reproduce the standard Alpaca, a general finetuned LLM.
This is on top of Hugging Face transformers with Pytorch backend, which natively requires a number of expensive GPU resources and takes significant time. - -By constract, BigDL here provides a CPU optimization to accelerate the lora finetuning of Llama2-7b, in the power of mixed-precision and distributed training. Detailedly, [Intel OneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html), an available Hugging Face backend, is able to speed up the Pytorch computation with BF16 datatype on CPUs, as well as parallel processing on Kubernetes enabled by [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html). - -The architecture is illustrated in the following: - -![image](https://github.com/Jasonzzt/BigDL/assets/60865256/b66416bc-ad07-49af-8cb0-8967dffb5f58) - -As above, BigDL implements its MPI training build on [Kubeflow MPI operator](https://github.com/kubeflow/mpi-operator/tree/master), which encapsulates the deployment as MPIJob CRD, and assists users to handle the construction of a MPI worker cluster on Kubernetes, such as public key distribution, SSH connection, and log collection. - -Now, let's go to deploy a Lora finetuning to create a LLM from Llama2-7b. - -**Note: Please make sure you have already have an available Kubernetes infrastructure and NFS shared storage, and install [Helm CLI](https://helm.sh/docs/helm/helm_install/) for Kubernetes job submission.** - -### 1. Install Kubeflow MPI Operator - -Follow [here](https://github.com/kubeflow/mpi-operator/tree/master#installation) to install a Kubeflow MPI operator in your Kubernetes, which will listen and receive the following MPIJob request at backend. - -### 2. Download Image, Base Model and Finetuning Data - -Follow [here](https://github.com/intel-analytics/BigDL/tree/main/docker/llm/finetune/lora/docker#prepare-bigdl-image-for-lora-finetuning) to prepare BigDL Lora Finetuning image in your cluster. - -As finetuning is from a base model, first download [Llama2-7b model from the public download site of Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). Then, download [cleaned alpaca data](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json), which contains all kinds of general knowledge and has already been cleaned. Next, move the downloaded files to a shared directory on your NFS server. - -### 3. Deploy through Helm Chart - -You are allowed to edit and experiment with different parameters in `./kubernetes/values.yaml` to improve finetuning performance and accuracy. For example, you can adjust `trainerNum` and `cpuPerPod` according to node and CPU core numbers in your cluster to make full use of these resources, and different `microBatchSize` result in different training speed and loss (here note that `microBatchSize`×`trainerNum` should not more than 128, as it is the batch size). - -**Note: `dataSubPath` and `modelSubPath` need to have the same names as files under the NFS directory in step 2.** - -After preparing parameters in `./kubernetes/values.yaml`, submit the job as beflow: - -```bash -cd ./kubernetes -helm install bigdl-lora-finetuning . -``` - -### 4. Check Deployment -```bash -kubectl get all -n bigdl-lora-finetuning # you will see launcher and worker pods running -``` - -### 5. Check Finetuning Process - -After deploying successfully, you can find a launcher pod, and then go inside this pod and check the logs collected from all workers. 
- -```bash -kubectl get all -n bigdl-lora-finetuning # you will see a launcher pod -kubectl exec -it bash -n bigdl-ppml-finetuning # enter launcher pod -cat launcher.log # display logs collected from other workers -``` - -From the log, you can see whether finetuning process has been invoked successfully in all MPI worker pods, and a progress bar with finetuning speed and estimated time will be showed after some data preprocessing steps (this may take quiet a while). - -For the fine-tuned model, it is written by the worker 0 (who holds rank 0), so you can find the model output inside the pod, which can be saved to host by command tools like `kubectl cp` or `scp`. - - -## To run in TDX-CoCo and enable Remote Attestation API - -You can deploy this workload in TDX CoCo and enable Remote Attestation API Serving with setting `TEEMode` in `./kubernetes/values.yaml` to `tdx`. The main diffences are it's need to execute the pods as root and mount TDX device, and a flask service is responsible for generating launcher's quote and collecting workers' quotes. - -### (Optional) Enable TLS -To enable TLS in Remote Attestation API Serving, you should provide a TLS certificate and setting `enableTLS` ( to `true` ), `base64ServerCrt` and `base64ServerKey` in `./kubernetes/values.yaml`. -```bash -# Generate a self-signed TLS certificate (DEBUG USE ONLY) -export COUNTRY_NAME=your_country_name -export CITY_NAME=your_city_name -export ORGANIZATION_NAME=your_organization_name -export COMMON_NAME=your_common_name -export EMAIL_ADDRESS=your_email_address - -openssl req -x509 -newkey rsa:4096 -nodes -out server.crt -keyout server.key -days 365 -subj "/C=$COUNTRY_NAME/ST=$CITY_NAME/L=$CITY_NAME/O=$ORGANIZATION_NAME/OU=$ORGANIZATION_NAME/CN=$COMMON_NAME/emailAddress=$EMAIL_ADDRESS/" - -# Calculate Base64 format string in values.yaml -cat server.crt | base64 -w 0 # Set in base64ServerCrt -cat server.key | base64 -w 0 # Set in base64ServerKey -``` - -To use RA Rest API, you need to get the IP of job-launcher: -``` bash -kubectl get all -n bigdl-lora-finetuning -``` -You will find a line like: -```bash -service/bigdl-lora-finetuning-launcher-attestation-api-service ClusterIP 10.109.87.248 9870/TCP 17m -``` -Here are IP and port of the Remote Attestation API service. - -The RA Rest API are listed below: -### 1. Generate launcher's quote -```bash -curl -X POST -H "Content-Type: application/json" -d '{"user_report_data": ""}' http://:/gen_quote -``` - -Example responce: - -```json -{"quote":"BAACAIEAAAAAAAA..."} -``` -### 2. 
Collect all cluster components' quotes (launcher and workers) -```bash -curl -X POST -H "Content-Type: application/json" -d '{"user_report_data": ""}' http://:/attest -``` - -Example responce: - -```json -{"quote_list":{"bigdl-lora-finetuning-job-worker-0":"BAACAIEAAAAAAA...","bigdl-lora-finetuning-job-worker-1":"BAACAIEAAAAAAA...","launcher":"BAACAIEAAAAAA..."}} -``` - diff --git a/docker/llm/finetune/lora/cpu/docker/README.md b/docker/llm/finetune/lora/cpu/docker/README.md index e988f8f049d..be86f2b22c9 100644 --- a/docker/llm/finetune/lora/cpu/docker/README.md +++ b/docker/llm/finetune/lora/cpu/docker/README.md @@ -3,7 +3,7 @@ You can download directly from Dockerhub like: ```bash -docker pull intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT +docker pull intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT ``` Or build the image from source: @@ -15,6 +15,6 @@ export HTTPS_PROXY=your_https_proxy docker build \ --build-arg http_proxy=${HTTP_PROXY} \ --build-arg https_proxy=${HTTPS_PROXY} \ - -t intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT \ + -t intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT \ -f ./Dockerfile . ``` diff --git a/docker/llm/finetune/lora/cpu/kubernetes/values.yaml b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml index 92a5f5e0b1b..8c3b9db2706 100644 --- a/docker/llm/finetune/lora/cpu/kubernetes/values.yaml +++ b/docker/llm/finetune/lora/cpu/kubernetes/values.yaml @@ -1,4 +1,4 @@ -imageName: intelanalytics/bigdl-llm-finetune-cpu:2.4.0-SNAPSHOT +imageName: intelanalytics/bigdl-llm-finetune-lora-cpu:2.4.0-SNAPSHOT trainerNum: 8 microBatchSize: 8 nfsServerIp: your_nfs_server_ip diff --git a/docker/llm/finetune/qlora/xpu/docker/README.md b/docker/llm/finetune/qlora/xpu/docker/README.md index 201dadf29ad..368fd52f2e8 100644 --- a/docker/llm/finetune/qlora/xpu/docker/README.md +++ b/docker/llm/finetune/qlora/xpu/docker/README.md @@ -28,14 +28,18 @@ docker build \ Here, we try to fine-tune a [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b) with [English Quotes](https://huggingface.co/datasets/Abirate/english_quotes) dataset, and please download them and start a docker container with files mounted like below: ```bash -export BASE_MODE_PATH= -export DATA_PATH= +export BASE_MODE_PATH=your_downloaded_base_model_path +export DATA_PATH=your_downloaded_data_path +export HTTP_PROXY=your_http_proxy +export HTTPS_PROXY=your_https_proxy docker run -itd \ --net=host \ --device=/dev/dri \ --memory="32G" \ --name=bigdl-llm-fintune-qlora-xpu \ + -e http_proxy=${HTTP_PROXY} \ + -e https_proxy=${HTTPS_PROXY} \ -v $BASE_MODE_PATH:/model \ -v $DATA_PATH:/data/english_quotes \ --shm-size="16g" \ @@ -45,11 +49,16 @@ docker run -itd \ The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. 
You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files: ```bash +export HTTP_PROXY=your_http_proxy +export HTTPS_PROXY=your_https_proxy + docker run -itd \ --net=host \ --device=/dev/dri \ --memory="32G" \ --name=bigdl-llm-fintune-qlora-xpu \ + -e http_proxy=${HTTP_PROXY} \ + -e https_proxy=${HTTPS_PROXY} \ --shm-size="16g" \ intelanalytics/bigdl-llm-fintune-qlora-xpu:2.4.0-SNAPSHOT ``` diff --git a/docker/llm/inference/xpu/docker/Dockerfile b/docker/llm/inference/xpu/docker/Dockerfile index 0b7da551b12..92dc893bc2f 100644 --- a/docker/llm/inference/xpu/docker/Dockerfile +++ b/docker/llm/inference/xpu/docker/Dockerfile @@ -8,7 +8,9 @@ ENV TZ=Asia/Shanghai # Disable pip's cache behavior ARG PIP_NO_CACHE_DIR=false -RUN apt-get update && \ +RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list && \ + apt-get update && \ apt-get install -y curl wget git gnupg gpg-agent && \ wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \ echo 'deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc' | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ diff --git a/docker/llm/serving/cpu/kubernetes/README.md b/docker/llm/serving/cpu/kubernetes/README.md index b0027f127b2..d5394d2943d 100644 --- a/docker/llm/serving/cpu/kubernetes/README.md +++ b/docker/llm/serving/cpu/kubernetes/README.md @@ -15,6 +15,8 @@ After downloading the model, please change name from `vicuna-7b-v1.5` to `vicuna You can download the model from [here](https://huggingface.co/lmsys/vicuna-7b-v1.5). +For ChatGLM models, users do not need to add `bigdl` into model path. We have already used the `BigDL-LLM` backend for this model. + ### Kubernetes config We recommend to setup your kubernetes cluster before deployment. Mostly importantly, please set `cpu-management-policy` to `static` by using this [tutorial](https://kubernetes.io/docs/tasks/administer-cluster/cpu-management-policies/). Also, it would be great to also set the `topology management policy` to `single-numa-node`. 
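The proxy settings added to the QLoRA fine-tuning `docker run` commands above only help if the variables are exported in the shell before the container is started. A minimal sketch for verifying that they actually reached the container, reusing the `bigdl-llm-fintune-qlora-xpu` name from the README above (adjust it if you passed a different `--name`):

```bash
# Print the proxy variables as seen inside the running fine-tuning container;
# empty values mean the -e http_proxy/https_proxy flags were not applied.
docker exec bigdl-llm-fintune-qlora-xpu \
  bash -c 'echo "http_proxy=${http_proxy:-<unset>}"; echo "https_proxy=${https_proxy:-<unset>}"'
```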
diff --git a/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt b/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt index a162d91c0c6..9339e8b3146 100644 --- a/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt +++ b/ppml/tdx/docker/trusted-bigdl-llm/inference/requirements.txt @@ -14,7 +14,7 @@ fastapi==0.95.2 pydantic==1.10.8 ### document qa -langchain==0.0.246 +langchain==0.0.308 pypdf chromadb==0.3.25 diff --git a/python/llm/README.md b/python/llm/README.md index bb19f43b14e..63cd54fab9e 100644 --- a/python/llm/README.md +++ b/python/llm/README.md @@ -40,23 +40,24 @@ Over 20 models have been optimized/verified on `bigdl-llm`, including *LLaMA/LLa | Model | Example | |-----------|----------------------------------------------------------| -| LLaMA *(such as Vicuna, Guanaco, Koala, Baize, WizardLM, etc.)* | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/vicuna) | -| LLaMA 2 | [link](example/transformers/transformers_int4/llama2) | -| MPT | [link](example/transformers/transformers_int4/mpt) | -| Falcon | [link](example/transformers/transformers_int4/falcon) | -| ChatGLM | [link](example/transformers/transformers_int4/chatglm) | -| ChatGLM2 | [link](example/transformers/transformers_int4/chatglm2) | -| Qwen | [link](example/transformers/transformers_int4/qwen) | -| MOSS | [link](example/transformers/transformers_int4/moss) | -| Baichuan | [link](example/transformers/transformers_int4/baichuan) | -| Baichuan2 | [link](example/transformers/transformers_int4/baichuan2) | -| Dolly-v1 | [link](example/transformers/transformers_int4/dolly_v1) | -| Dolly-v2 | [link](example/transformers/transformers_int4/dolly_v2) | -| RedPajama | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/redpajama) | -| Phoenix | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/phoenix) | -| StarCoder | [link1](example/transformers/native_int4), [link2](example/transformers/transformers_int4/starcoder) | -| InternLM | [link](example/transformers/transformers_int4/internlm) | -| Whisper | [link](example/transformers/transformers_int4/whisper) | +| LLaMA *(such as Vicuna, Guanaco, Koala, Baize, WizardLM, etc.)* | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/vicuna) | +| LLaMA 2 | [link](example/CPU/HF-Transformers-AutoModels/Model/llama2) | +| MPT | [link](example/CPU/HF-Transformers-AutoModels/Model/mpt) | +| Falcon | [link](example/CPU/HF-Transformers-AutoModels/Model/falcon) | +| ChatGLM | [link](example/CPU/HF-Transformers-AutoModels/Model/chatglm) | +| ChatGLM2 | [link](example/CPU/HF-Transformers-AutoModels/Model/chatglm2) | +| Qwen | [link](example/CPU/HF-Transformers-AutoModels/Model/qwen) | +| MOSS | [link](example/CPU/HF-Transformers-AutoModels/Model/moss) | +| Baichuan | [link](example/CPU/HF-Transformers-AutoModels/Model/baichuan) | +| Baichuan2 | [link](example/CPU/HF-Transformers-AutoModels/Model/baichuan2) | +| Dolly-v1 | [link](example/CPU/HF-Transformers-AutoModels/Model/dolly_v1) | +| Dolly-v2 | [link](example/CPU/HF-Transformers-AutoModels/Model/dolly_v2) | +| RedPajama | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/redpajama) | +| Phoenix | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/phoenix) | +| StarCoder | [link1](example/CPU/Native-Models), [link2](example/CPU/HF-Transformers-AutoModels/Model/starcoder) | +| InternLM 
| [link](example/CPU/HF-Transformers-AutoModels/Model/internlm) | +| Whisper | [link](example/CPU/HF-Transformers-AutoModels/Model/whisper) | +| Aquila | [link](example/CPU/HF-Transformers-AutoModels/Model/aquila) | @@ -119,7 +120,7 @@ output_ids = model.generate(input_ids, ...) output = tokenizer.batch_decode(output_ids) ``` -See the complete examples [here](example/transformers/transformers_int4/). +See the complete examples [here](example/CPU/HF-Transformers-AutoModels/Model/). ###### GPU INT4 You may apply INT4 optimizations to any Hugging Face *Transformers* model on Intel GPU as follows. @@ -138,7 +139,7 @@ input_ids = tokenizer.encode(input_str, ...).to('xpu') output_ids = model.generate(input_ids, ...) output = tokenizer.batch_decode(output_ids.cpu()) ``` -See the complete examples [here](example/gpu/). +See the complete examples [here](example/GPU). ###### More Low-Bit Support - Save and load @@ -148,7 +149,7 @@ See the complete examples [here](example/gpu/). model.save_low_bit(model_path) new_model = AutoModelForCausalLM.load_low_bit(model_path) ``` - *See the complete example [here](example/transformers/transformers_low_bit/).* + *See the complete example [here](example/CPU/HF-Transformers-AutoModels/Save-Load).* - Additonal data types @@ -157,7 +158,7 @@ See the complete examples [here](example/gpu/). ```python model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_low_bit="sym_int8") ``` - *See the complete example [here](example/transformers/transformers_low_bit/).* + *See the complete example [here](example/CPU/HF-Transformers-AutoModels/More-Data-Types).* ##### 2. Native INT4 model @@ -182,7 +183,7 @@ output_ids = llm.generate(input_ids, ...) output = llm.batch_decode(output_ids) ``` -See the complete example [here](example/transformers/native_int4/native_int4_pipeline.py). +See the complete example [here](example/CPU/Native-Models/native_int4_pipeline.py). ##### 3. LangChain API You may run the models using the LangChain API in `bigdl-llm`. @@ -202,7 +203,7 @@ You may run the models using the LangChain API in `bigdl-llm`. doc_chain = load_qa_chain(bigdl_llm, ...) output = doc_chain.run(...) ``` - See the examples [here](example/langchain/transformers_int4). + See the examples [here](example/CPU/LangChain/transformers_int4). - **Using native INT4 model** @@ -224,7 +225,7 @@ You may run the models using the LangChain API in `bigdl-llm`. doc_chain.run(...) ``` - See the examples [here](example/langchain/native_int4). + See the examples [here](example/CPU/LangChain/native_int4). ##### 4. CLI Tool >**Note**: Currently `bigdl-llm` CLI supports *LLaMA* (e.g., *vicuna*), *GPT-NeoX* (e.g., *redpajama*), *BLOOM* (e.g., *pheonix*) and *GPT2* (e.g., *starcoder*) model architecture; for other models, you may use the Hugging Face `transformers` or LangChain APIs. diff --git a/python/llm/dev/benchmark/run-benchmark-tests.sh b/python/llm/dev/benchmark/run-benchmark-tests.sh index 1fa8032e5b4..5ec5c489e18 100644 --- a/python/llm/dev/benchmark/run-benchmark-tests.sh +++ b/python/llm/dev/benchmark/run-benchmark-tests.sh @@ -12,11 +12,6 @@ export OMP_NUM_THREADS=$THREAD_NUM ######## LLAMA2 # transformers -if [ ! -d $ORIGINAL_LLAMA2_PATH ]; then - echo "Directory $ORIGINAL_LLAMA2_PATH not found. Downloading from FTP server..." 
- wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/${ORIGINAL_LLAMA2_PATH:2} -P $LLM_DIR -fi - echo ">>> Testing LLAMA2 transformers API" -taskset -c 0-$((THREAD_NUM - 1)) python python/llm/dev/benchmark/pipelines/llama2_test.py --repo-id-or-model-path $ORIGINAL_LLAMA2_PATH +taskset -c 0-$((THREAD_NUM - 1)) python python/llm/dev/benchmark/pipelines/llama2_test.py --repo-id-or-model-path $LLAMA2_7B_ORIGIN_PATH diff --git a/python/llm/example/transformers/transformers_int4/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md similarity index 97% rename from python/llm/example/transformers/transformers_int4/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md index 79e23fd1540..497e7e6c209 100644 --- a/python/llm/example/transformers/transformers_int4/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/README.md @@ -21,6 +21,7 @@ You can use BigDL-LLM to run any Huggingface Transformer models with INT4 optimi | InternLM | [link](internlm) | | Whisper | [link](whisper) | | Qwen | [link](qwen) | +| Aquila | [link](aquila) | ## Recommended Requirements To run the examples, we recommend using Intel® Xeon® processors (server), or >= 12th Gen Intel® Core™ processor (client). diff --git a/python/llm/example/transformers/transformers_int4/aquila/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/aquila/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md diff --git a/python/llm/example/transformers/transformers_int4/aquila/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/aquila/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py diff --git a/python/llm/example/transformers/transformers_int4/baichuan/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/README.md diff --git a/python/llm/example/transformers/transformers_int4/baichuan/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py diff --git a/python/llm/example/transformers/transformers_int4/baichuan2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/README.md diff --git a/python/llm/example/transformers/transformers_int4/baichuan2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/baichuan2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/chatglm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md similarity index 100% rename from 
python/llm/example/transformers/transformers_int4/chatglm/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md diff --git a/python/llm/example/transformers/transformers_int4/chatglm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py diff --git a/python/llm/example/transformers/transformers_int4/chatglm2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/README.md diff --git a/python/llm/example/transformers/transformers_int4/chatglm2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/chatglm2/streamchat.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/chatglm2/streamchat.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py diff --git a/python/llm/example/transformers/transformers_int4/dolly_v1/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v1/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/README.md diff --git a/python/llm/example/transformers/transformers_int4/dolly_v1/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v1/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py diff --git a/python/llm/example/transformers/transformers_int4/dolly_v2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/README.md diff --git a/python/llm/example/transformers/transformers_int4/dolly_v2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/dolly_v2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/falcon/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/falcon/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/README.md diff --git a/python/llm/example/transformers/transformers_int4/falcon/falcon-40b-instruct/modelling_RW.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-40b-instruct/modelling_RW.py similarity index 100% rename from 
python/llm/example/transformers/transformers_int4/falcon/falcon-40b-instruct/modelling_RW.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-40b-instruct/modelling_RW.py diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/falcon/falcon-7b-instruct/modelling_RW.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py diff --git a/python/llm/example/transformers/transformers_int4/falcon/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/falcon/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py diff --git a/python/llm/example/transformers/transformers_int4/internlm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/internlm/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/README.md diff --git a/python/llm/example/transformers/transformers_int4/internlm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/internlm/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py diff --git a/python/llm/example/transformers/transformers_int4/llama2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/llama2/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/README.md diff --git a/python/llm/example/transformers/transformers_int4/llama2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/llama2/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py diff --git a/python/llm/example/transformers/transformers_int4/moss/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/moss/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/README.md diff --git a/python/llm/example/transformers/transformers_int4/moss/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/moss/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py diff --git a/python/llm/example/transformers/transformers_int4/mpt/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/mpt/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/README.md diff --git a/python/llm/example/transformers/transformers_int4/mpt/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py similarity index 100% rename from 
python/llm/example/transformers/transformers_int4/mpt/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py diff --git a/python/llm/example/transformers/transformers_int4/phoenix/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/phoenix/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/README.md diff --git a/python/llm/example/transformers/transformers_int4/phoenix/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/phoenix/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py diff --git a/python/llm/example/transformers/transformers_int4/qwen/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/qwen/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/README.md diff --git a/python/llm/example/transformers/transformers_int4/qwen/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/qwen/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py diff --git a/python/llm/example/transformers/transformers_int4/redpajama/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/redpajama/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/README.md diff --git a/python/llm/example/transformers/transformers_int4/redpajama/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/redpajama/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/redpajama/generate.py diff --git a/python/llm/example/transformers/transformers_int4/starcoder/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/starcoder/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/README.md diff --git a/python/llm/example/transformers/transformers_int4/starcoder/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/starcoder/generate.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/starcoder/generate.py diff --git a/python/llm/example/transformers/transformers_int4/vicuna/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/README.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/vicuna/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/README.md diff --git a/python/llm/example/transformers/transformers_int4/vicuna/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/vicuna/generate.py rename to 
python/llm/example/CPU/HF-Transformers-AutoModels/Model/vicuna/generate.py diff --git a/python/llm/example/transformers/transformers_int4/whisper/long-segment-recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/whisper/long-segment-recognize.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/long-segment-recognize.py diff --git a/python/llm/example/transformers/transformers_int4/whisper/readme.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/readme.md similarity index 100% rename from python/llm/example/transformers/transformers_int4/whisper/readme.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/readme.md diff --git a/python/llm/example/transformers/transformers_int4/whisper/recognize.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/whisper/recognize.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/Model/whisper/recognize.py diff --git a/python/llm/example/transformers/transformers_low_bit/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/README.md similarity index 100% rename from python/llm/example/transformers/transformers_low_bit/README.md rename to python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/README.md diff --git a/python/llm/example/transformers/transformers_low_bit/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py similarity index 100% rename from python/llm/example/transformers/transformers_low_bit/transformers_low_bit_pipeline.py rename to python/llm/example/CPU/HF-Transformers-AutoModels/More-Data-Types/transformers_low_bit_pipeline.py diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/README.md new file mode 100644 index 00000000000..e0cebde5b45 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/README.md @@ -0,0 +1,7 @@ +# Running Hugging Face Transformers model using BigDL-LLM on Intel CPU + +This folder contains examples of running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs): + +- [Model](Model): examples of running Hugging Face Transformers models (e.g., LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) +- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md new file mode 100644 index 00000000000..6a992c857a9 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/README.md @@ -0,0 +1,43 @@ +# BigDL-LLM Transformers Low-Bit Inference Pipeline for Large Language Model + +In this example, we show a pipeline to apply BigDL-LLM low-bit optimizations (including INT8/INT5/INT4) to any Hugging Face Transformers model, and then run inference on the optimized low-bit model. 
+
+## Prepare Environment
+We suggest using conda to manage the environment:
+```bash
+conda create -n llm python=3.9
+conda activate llm
+
+pip install --pre --upgrade bigdl-llm[all]
+```
+
+## Run Example
+```bash
+python ./transformers_low_bit_pipeline.py --repo-id-or-model-path decapoda-research/llama-7b-hf --low-bit sym_int5 --save-path ./llama-7b-sym_int5
+```
+arguments info:
+- `--repo-id-or-model-path`: str value, the Hugging Face repo id of the large language model to be downloaded, or the path to a Hugging Face checkpoint folder; it is 'decapoda-research/llama-7b-hf' by default.
+- `--low-bit`: str value, options are sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8 (sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4, etc.). The relevant low-bit optimizations will be applied to the model.
+- `--save-path`: str value, the path to save the low-bit model. You can then load the low-bit model directly from this path.
+- `--load-path`: optional str value, the path to load the low-bit model.
+
+
+## Sample Output for Inference
+### 'decapoda-research/llama-7b-hf' Model
+```log
+Prompt: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
+Output: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She wanted to be a princess, and she wanted to be a pirate. She wanted to be a superhero, and she wanted to be
+Model and tokenizer are saved to ./llama-7b-sym_int5
+```
+
+### Load low-bit model
+Command to run:
+```bash
+python ./transformers_low_bit_pipeline.py --load-path ./llama-7b-sym_int5
+```
+Output log:
+```log
+Prompt: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun
+Output: Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. She wanted to be a princess, and she wanted to be a pirate. She wanted to be a superhero, and she wanted to be
+```
+
diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py
new file mode 100644
index 00000000000..9cf9cffb878
--- /dev/null
+++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Save-Load/transformers_low_bit_pipeline.py
@@ -0,0 +1,56 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +import argparse +from bigdl.llm.transformers import AutoModelForCausalLM +from transformers import LlamaTokenizer, TextGenerationPipeline + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Transformer save_load example') + parser.add_argument('--repo-id-or-model-path', type=str, default="decapoda-research/llama-7b-hf", + help='The huggingface repo id for the large language model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--low-bit', type=str, default="sym_int4", + choices=['sym_int4', 'asym_int4', 'sym_int5', 'asym_int5', 'sym_int8'], + help='The quantization type the model will convert to.') + parser.add_argument('--save-path', type=str, default=None, + help='The path to save the low-bit model.') + parser.add_argument('--load-path', type=str, default=None, + help='The path to load the low-bit model.') + args = parser.parse_args() + model_path = args.repo_id_or_model_path + low_bit = args.low_bit + load_path = args.load_path + if load_path: + model = AutoModelForCausalLM.load_low_bit(load_path) + tokenizer = LlamaTokenizer.from_pretrained(load_path) + else: + # load_in_low_bit in bigdl.llm.transformers will convert + # the relevant layers in the model into corresponding int X format + model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True) + tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True) + + pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, max_new_tokens=32) + input_str = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" + output = pipeline(input_str)[0]["generated_text"] + print(f"Prompt: {input_str}") + print(f"Output: {output}") + + save_path = args.save_path + if save_path: + model.save_low_bit(save_path) + tokenizer.save_pretrained(save_path) + print(f"Model and tokenizer are saved to {save_path}") diff --git a/python/llm/example/langchain/README.md b/python/llm/example/CPU/LangChain/README.md similarity index 100% rename from python/llm/example/langchain/README.md rename to python/llm/example/CPU/LangChain/README.md diff --git a/python/llm/example/langchain/native_int4/docqa.py b/python/llm/example/CPU/LangChain/native_int4/docqa.py similarity index 100% rename from python/llm/example/langchain/native_int4/docqa.py rename to python/llm/example/CPU/LangChain/native_int4/docqa.py diff --git a/python/llm/example/langchain/native_int4/streamchat.py b/python/llm/example/CPU/LangChain/native_int4/streamchat.py similarity index 100% rename from python/llm/example/langchain/native_int4/streamchat.py rename to python/llm/example/CPU/LangChain/native_int4/streamchat.py diff --git a/python/llm/example/langchain/native_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/native_int4/voiceassistant.py similarity index 100% rename from python/llm/example/langchain/native_int4/voiceassistant.py rename to python/llm/example/CPU/LangChain/native_int4/voiceassistant.py diff --git a/python/llm/example/langchain/transformers_int4/chat.py b/python/llm/example/CPU/LangChain/transformers_int4/chat.py similarity index 100% rename from python/llm/example/langchain/transformers_int4/chat.py rename to python/llm/example/CPU/LangChain/transformers_int4/chat.py diff --git a/python/llm/example/langchain/transformers_int4/docqa.py b/python/llm/example/CPU/LangChain/transformers_int4/docqa.py similarity index 100% rename from 
python/llm/example/langchain/transformers_int4/docqa.py rename to python/llm/example/CPU/LangChain/transformers_int4/docqa.py diff --git a/python/llm/example/langchain/transformers_int4/llm_math.py b/python/llm/example/CPU/LangChain/transformers_int4/llm_math.py similarity index 100% rename from python/llm/example/langchain/transformers_int4/llm_math.py rename to python/llm/example/CPU/LangChain/transformers_int4/llm_math.py diff --git a/python/llm/example/langchain/transformers_int4/voiceassistant.py b/python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py similarity index 100% rename from python/llm/example/langchain/transformers_int4/voiceassistant.py rename to python/llm/example/CPU/LangChain/transformers_int4/voiceassistant.py diff --git a/python/llm/example/transformers/native_int4/README.md b/python/llm/example/CPU/Native-Models/README.md similarity index 100% rename from python/llm/example/transformers/native_int4/README.md rename to python/llm/example/CPU/Native-Models/README.md diff --git a/python/llm/example/transformers/native_int4/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py similarity index 100% rename from python/llm/example/transformers/native_int4/native_int4_pipeline.py rename to python/llm/example/CPU/Native-Models/native_int4_pipeline.py diff --git a/python/llm/example/pytorch-models/README.md b/python/llm/example/CPU/PyTorch-Models/Model/README.md similarity index 100% rename from python/llm/example/pytorch-models/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/README.md diff --git a/python/llm/example/pytorch-models/bark/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bark/README.md similarity index 100% rename from python/llm/example/pytorch-models/bark/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/bark/README.md diff --git a/python/llm/example/pytorch-models/bark/synthesize_speech.py b/python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py similarity index 100% rename from python/llm/example/pytorch-models/bark/synthesize_speech.py rename to python/llm/example/CPU/PyTorch-Models/Model/bark/synthesize_speech.py diff --git a/python/llm/example/pytorch-models/bert/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bert/README.md similarity index 100% rename from python/llm/example/pytorch-models/bert/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/bert/README.md diff --git a/python/llm/example/pytorch-models/bert/extract_feature.py b/python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py similarity index 100% rename from python/llm/example/pytorch-models/bert/extract_feature.py rename to python/llm/example/CPU/PyTorch-Models/Model/bert/extract_feature.py diff --git a/python/llm/example/pytorch-models/chatglm/README.md b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md similarity index 100% rename from python/llm/example/pytorch-models/chatglm/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md diff --git a/python/llm/example/pytorch-models/chatglm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py similarity index 100% rename from python/llm/example/pytorch-models/chatglm/generate.py rename to python/llm/example/CPU/PyTorch-Models/Model/chatglm/generate.py diff --git a/python/llm/example/pytorch-models/llama2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md similarity index 100% rename from 
python/llm/example/pytorch-models/llama2/README.md rename to python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md diff --git a/python/llm/example/pytorch-models/llama2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py similarity index 100% rename from python/llm/example/pytorch-models/llama2/generate.py rename to python/llm/example/CPU/PyTorch-Models/Model/llama2/generate.py diff --git a/python/llm/example/pytorch-models/openai-whisper/readme.md b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/readme.md similarity index 100% rename from python/llm/example/pytorch-models/openai-whisper/readme.md rename to python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/readme.md diff --git a/python/llm/example/pytorch-models/openai-whisper/recognize.py b/python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py similarity index 100% rename from python/llm/example/pytorch-models/openai-whisper/recognize.py rename to python/llm/example/CPU/PyTorch-Models/Model/openai-whisper/recognize.py diff --git a/python/llm/example/CPU/PyTorch-Models/More-Data-Types/.keep b/python/llm/example/CPU/PyTorch-Models/More-Data-Types/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/CPU/PyTorch-Models/README.md b/python/llm/example/CPU/PyTorch-Models/README.md new file mode 100644 index 00000000000..06860d4563d --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/README.md @@ -0,0 +1,7 @@ +# Running PyTorch model using BigDL-LLM on Intel CPU + +This folder contains examples of running any PyTorch model on BigDL-LLM (with "one-line code change"): + +- [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) 
+- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/CPU/PyTorch-Models/Save-Load/.keep b/python/llm/example/CPU/PyTorch-Models/Save-Load/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/CPU/README.md b/python/llm/example/CPU/README.md new file mode 100644 index 00000000000..1344cbb6dcb --- /dev/null +++ b/python/llm/example/CPU/README.md @@ -0,0 +1,18 @@ +# BigDL-LLM Examples on Intel CPU + +This folder contains examples of running BigDL-LLM on Intel CPU: + +- [HF-Transformers-AutoModels](HF-Transformers-AutoModels): running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs) +- [PyTorch-Models](PyTorch-Models): running any PyTorch model on BigDL-LLM (with "one-line code change") +- [Native-Models](Native-Models): converting & running LLM in `llama`/`chatglm`/`bloom`/`gptneox`/`starcoder` model family using native (cpp) implementation +- [LangChain](LangChain): running LangChain applications on BigDL-LLM + +## System Support +**Hardware**: +- Intel® Core™ processors +- Intel® Xeon® processors + +**Operating System**: +- Ubuntu 20.04 or later +- CentOS 7 or later +- Windows 10/11, with or without WSL diff --git a/python/llm/example/gpu/hf-transformers-models/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/README.md similarity index 98% rename from python/llm/example/gpu/hf-transformers-models/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/README.md index 0798745b952..a2164718533 100644 --- a/python/llm/example/gpu/hf-transformers-models/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/README.md @@ -21,6 +21,7 @@ You can use BigDL-LLM to run almost every Huggingface Transformer models with IN - Intel Arc™ A-Series Graphics - Intel Data Center GPU Flex Series +- Intel Data Center GPU Max Series ## Recommended Requirements To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. 
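For orientation, the AutoModel-style examples referenced in the READMEs above generally follow one pattern: load the checkpoint through `bigdl.llm.transformers.AutoModelForCausalLM` with INT4 optimizations, then run it like an ordinary Hugging Face model. A minimal sketch (the checkpoint id, prompt, and the optional XPU move are illustrative assumptions, not code copied from a specific example):

```python
from bigdl.llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-2-7b-chat-hf"  # illustrative checkpoint

# Load the model with BigDL-LLM INT4 optimizations applied to its linear layers
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# For the GPU examples the model is additionally moved to the Intel GPU,
# e.g. model = model.to('xpu'), after the driver and oneAPI setup described
# in the Requirements sections.

prompt = "Once upon a time, there existed a little girl who liked to have adventures."
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(inputs.input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```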
diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/baichuan2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chatglm2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chatglm2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/chatglm2/streamchat.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chatglm2/streamchat.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chatglm2/streamchat.py diff --git a/python/llm/example/gpu/hf-transformers-models/chinese-llama2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chinese-llama2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/chinese-llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/chinese-llama2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/chinese-llama2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/falcon/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/README.md diff --git 
a/python/llm/example/transformers/transformers_int4/falcon/falcon-7b-instruct/modelling_RW.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py similarity index 100% rename from python/llm/example/transformers/transformers_int4/falcon/falcon-7b-instruct/modelling_RW.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/falcon-7b-instruct/modelling_RW.py diff --git a/python/llm/example/gpu/hf-transformers-models/falcon/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/falcon/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/falcon/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/gpt-j/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/gpt-j/readme.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/readme.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/gpt-j/readme.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/gpt-j/readme.md diff --git a/python/llm/example/gpu/hf-transformers-models/internlm/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/internlm/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/internlm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/internlm/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/llama2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/llama2/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/llama2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/llama2/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/llama2/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/mpt/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/mpt/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/mpt/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/mpt/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/mpt/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/qwen/README.md 
b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/qwen/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/qwen/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/qwen/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/starcoder/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/starcoder/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/starcoder/readme.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/readme.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/starcoder/readme.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/starcoder/readme.md diff --git a/python/llm/example/gpu/hf-transformers-models/voiceassistant/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/README.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/voiceassistant/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/README.md diff --git a/python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/voiceassistant/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/voiceassistant/generate.py diff --git a/python/llm/example/gpu/hf-transformers-models/whisper/readme.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/readme.md similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/whisper/readme.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/readme.md diff --git a/python/llm/example/gpu/hf-transformers-models/whisper/recognize.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py similarity index 100% rename from python/llm/example/gpu/hf-transformers-models/whisper/recognize.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Model/whisper/recognize.py diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/.keep b/python/llm/example/GPU/HF-Transformers-AutoModels/More-Data-Types/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/README.md new file mode 100644 index 00000000000..da1a13d6fb3 --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/README.md @@ -0,0 +1,7 @@ +# Running Hugging Face Transformers model using BigDL-LLM on Intel GPU + +This folder contains examples of running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs): + +- [Model](Model): examples of running Hugging Face Transformers models (e.g., LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) 
using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) +- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/.keep b/python/llm/example/GPU/HF-Transformers-AutoModels/Save-Load/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/PyTorch-Models/Model/.keep b/python/llm/example/GPU/PyTorch-Models/Model/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/PyTorch-Models/More-Data-Types/.keep b/python/llm/example/GPU/PyTorch-Models/More-Data-Types/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/GPU/PyTorch-Models/README.md b/python/llm/example/GPU/PyTorch-Models/README.md new file mode 100644 index 00000000000..ce5cd50efdf --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/README.md @@ -0,0 +1,7 @@ +# Running PyTorch model using BigDL-LLM on Intel GPU + +This folder contains examples of running any PyTorch model on BigDL-LLM (with "one-line code change"): + +- [Model](Model): examples of running PyTorch models (e.g., Openai Whisper, LLaMA2, ChatGLM2, Falcon, MPT, Baichuan2, etc.) using INT4 optimizations +- [More-Data-Types](More-Data-Types): examples of applying other low bit optimizations (NF4/INT5/INT8, etc.) +- [Save-Load](Save-Load): examples of saving and loading low-bit models diff --git a/python/llm/example/GPU/PyTorch-Models/Save-Load/.keep b/python/llm/example/GPU/PyTorch-Models/Save-Load/.keep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/llm/example/gpu/qlora_finetuning/README.md b/python/llm/example/GPU/QLoRA-FineTuning/README.md similarity index 100% rename from python/llm/example/gpu/qlora_finetuning/README.md rename to python/llm/example/GPU/QLoRA-FineTuning/README.md diff --git a/python/llm/example/gpu/qlora_finetuning/export_merged_model.py b/python/llm/example/GPU/QLoRA-FineTuning/export_merged_model.py similarity index 100% rename from python/llm/example/gpu/qlora_finetuning/export_merged_model.py rename to python/llm/example/GPU/QLoRA-FineTuning/export_merged_model.py diff --git a/python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py b/python/llm/example/GPU/QLoRA-FineTuning/qlora_finetuning.py similarity index 100% rename from python/llm/example/gpu/qlora_finetuning/qlora_finetuning.py rename to python/llm/example/GPU/QLoRA-FineTuning/qlora_finetuning.py diff --git a/python/llm/example/GPU/README.md b/python/llm/example/GPU/README.md new file mode 100644 index 00000000000..8cb7c7211b3 --- /dev/null +++ b/python/llm/example/GPU/README.md @@ -0,0 +1,26 @@ +# BigDL-LLM Examples on Intel GPU + +This folder contains examples of running BigDL-LLM on Intel GPU: + +- [HF-Transformers-AutoModels](HF-Transformers-AutoModels): running any Hugging Face Transformers model on BigDL-LLM (using the standard AutoModel APIs) +- [PyTorch-Models](PyTorch-Models): running any PyTorch model on BigDL-LLM (with "one-line code change") +- [QLoRA-FineTuning](QLoRA-FineTuning): running QLoRA finetuning on BigDL-LLM + + +## System Support +**Hardware**: +- Intel Arc™ A-Series Graphics +- Intel Data Center GPU Flex Series +- Intel Data Center GPU Max Series + +**Operating System**: +- Ubuntu 20.04 or later (Ubuntu 22.04 is preferred) + +## Requirements +To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. 
+ +Step 1, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. +> **Note**: IPEX 2.0.110+xpu requires Intel GPU Driver version is [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). + +Step 2, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and DPC++ compiler are needed, others are optional. +> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit's version >= 2023.2.0. diff --git a/python/llm/example/cpp-python/README.md b/python/llm/example/cpp-python/README.md deleted file mode 100644 index 60d51707a7a..00000000000 --- a/python/llm/example/cpp-python/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# BigDL-LLM INT4 Inference Using Llama-Cpp-Python Format API - -In this example, we show how to run inference on converted INT4 model using llama-cpp-python format API. - -> **Note**: Currently model family LLaMA, GPT-NeoX, BLOOM and StarCoder are supported. - -## Prepare Environment -We suggest using conda to manage environment: -```bash -conda create -n llm python=3.9 -conda activate llm - -pip install --pre --upgrade bigdl-llm[all] -``` - -## Convert Models using bigdl-llm -Follow the instructions in [Convert model](https://github.com/intel-analytics/BigDL/tree/main/python/llm#convert-model). - - -## Run the example -```bash -python ./int4_inference.py -m CONVERTED_MODEL_PATH -x MODEL_FAMILY -p PROMPT -t THREAD_NUM -``` -arguments info: -- `-m CONVERTED_MODEL_PATH`: **required**, path to the converted model -- `-x MODEL_FAMILY`: **required**, the model family of the model specified in `-m`, available options are `llama`, `gptneox`, `bloom` and `starcoder` -- `-p PROMPT`: question to ask. Default is `What is AI?`. -- `-t THREAD_NUM`: specify the number of threads to use for inference. Default is `2`. diff --git a/python/llm/example/cpp-python/int4_inference.py b/python/llm/example/cpp-python/int4_inference.py deleted file mode 100644 index b7edcb68924..00000000000 --- a/python/llm/example/cpp-python/int4_inference.py +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. 
- -import argparse - -def main(args): - model_family = args.model_family - model_path = args.model_path - prompt = args.prompt - n_threads = args.thread_num - - if model_family == "llama": - from bigdl.llm.models import Llama - modelclass = Llama - if model_family == "bloom": - from bigdl.llm.models import Bloom - modelclass = Bloom - if model_family == "gptneox": - from bigdl.llm.models import Gptneox - modelclass = Gptneox - if model_family == "starcoder": - from bigdl.llm.models import Starcoder - modelclass = Starcoder - - model = modelclass(model_path, n_threads=n_threads) - response=model(prompt) - print(response) - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Llama-CPP-Python style API Simple Example') - parser.add_argument('-x','--model-family', type=str, required=True, - choices=["llama", "bloom", "gptneox", "starcoder"], - help='the model family') - parser.add_argument('-m','--model-path', type=str, required=True, - help='the path to the converted llm model') - parser.add_argument('-p', '--prompt', type=str, default='What is AI?', - help='qustion you want to ask.') - parser.add_argument('-t','--thread-num', type=int, default=2, - help='number of threads to use for inference') - args = parser.parse_args() - - main(args) diff --git a/python/llm/example/gpu/README.md b/python/llm/example/gpu/README.md deleted file mode 100644 index 1abff7e56cd..00000000000 --- a/python/llm/example/gpu/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# BigDL-LLM INT4 Optimization for Large Language Model on Intel GPUs -You can use BigDL-LLM to run almost every Huggingface Transformer models with INT4 optimizations on your laptops with Intel GPUs. Moreover, you can also use `optimize_model` API to accelerate general PyTorch models on Intel GPUs. - -## Verified models -| Model | Example | -|------------|----------------------------------------------------------| -| Baichuan | [link](hf-transformers-models/baichuan) | -| Baichuan2 | [link](hf-transformers-models/baichuan2) | -| ChatGLM2 | [link](hf-transformers-models/chatglm2) | -| Chinese Llama2 | [link](hf-transformers-models/chinese-llama2)| -| Falcon | [link](hf-transformers-models/falcon) | -| GPT-J | [link](hf-transformers-models/gpt-j) | -| InternLM | [link](hf-transformers-models/internlm) | -| LLaMA 2 | [link](hf-transformers-models/llama2) | -| MPT | [link](hf-transformers-models/mpt) | -| Qwen | [link](hf-transformers-models/qwen) | -| StarCoder | [link](hf-transformers-models/starcoder) | -| Whisper | [link](hf-transformers-models/whisper) | - -## Verified Hardware Platforms - -- Intel Arc™ A-Series Graphics -- Intel Data Center GPU Flex Series - -## Recommended Requirements -To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. - -Step 1, only Linux system is supported now, Ubuntu 22.04 is prefered. - -Step 2, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. -> **Note**: IPEX 2.0.110+xpu requires Intel GPU Driver version is [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). - -Step 3, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and DPC++ compiler are needed, others are optional. -> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit's version >= 2023.2.0. 
- -## Best Known Configuration on Linux -For better performance, it is recommended to set environment variables on Linux: -```bash -export USE_XETLA=OFF -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -``` diff --git a/python/llm/example/gpu/pytorch-models/README.md b/python/llm/example/gpu/pytorch-models/README.md deleted file mode 100644 index 6c958e7a968..00000000000 --- a/python/llm/example/gpu/pytorch-models/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# BigDL-LLM INT4 Optimization for Large Language Model on Intel GPUs -You can use `optimize_model` API to accelerate general PyTorch models on Intel servers and PCs. This directory contains example scripts to help you quickly get started using BigDL-LLM to run some popular open-source models in the community. Each model has its own dedicated folder, where you can find detailed instructions on how to install and run it. - -## Verified Hardware Platforms - -- Intel Arc™ A-Series Graphics -- Intel Data Center GPU Flex Series - -## Recommended Requirements -To apply Intel GPU acceleration, there’re several steps for tools installation and environment preparation. - -Step 1, only Linux system is supported now, Ubuntu 22.04 is prefered. - -Step 2, please refer to our [driver installation](https://dgpu-docs.intel.com/driver/installation.html) for general purpose GPU capabilities. -> **Note**: IPEX 2.0.110+xpu requires Intel GPU Driver version is [Stable 647.21](https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html). - -Step 3, you also need to download and install [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). OneMKL and DPC++ compiler are needed, others are optional. -> **Note**: IPEX 2.0.110+xpu requires Intel® oneAPI Base Toolkit's version >= 2023.2.0. - -## Best Known Configuration on Linux -For better performance, it is recommended to set environment variables on Linux: -```bash -export USE_XETLA=OFF -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -``` diff --git a/python/llm/example/transformers/transformers_int4/GPU/README.md b/python/llm/example/transformers/transformers_int4/GPU/README.md deleted file mode 100644 index f12e7824f0d..00000000000 --- a/python/llm/example/transformers/transformers_int4/GPU/README.md +++ /dev/null @@ -1 +0,0 @@ -### The GPU examples for `bigdl-llm` have been moved to [here](../../../gpu). diff --git a/python/llm/portable-executable/.gitignore b/python/llm/portable-executable/.gitignore deleted file mode 100644 index 23c0161c874..00000000000 --- a/python/llm/portable-executable/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -python-embed -portable-executable.zip \ No newline at end of file diff --git a/python/llm/portable-executable/README.md b/python/llm/portable-executable/README.md deleted file mode 100644 index 0f1df88f1f4..00000000000 --- a/python/llm/portable-executable/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# BigDL-LLM Portable Executable For Windows: User Guide - -This portable executable includes everything you need to run LLM (except models). Please refer to How to use section to get started. - -## 13B model running on an Intel 11-Gen Core PC (real-time screen capture) - -
- -## Verified Models - -- ChatGLM2-6b -- Baichuan-13B-Chat -- Baichuan2-7B-Chat -- internlm-chat-7b-8k -- Llama-2-7b-chat-hf - -## How to use - -1. Download the model to your computer. Please ensure there is a file named `config.json` in the model folder, otherwise the script won't work. - - ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step1.png) - -2. Run `chat.bat` in Terminal and input the path of the model (e.g. `path\to\model`, note that there's no slash at the end of the path). - - ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step2.png) - -3. Press Enter and wait until model finishes loading. Then enjoy chatting with the model! -4. If you want to stop chatting, just input `stop` and the model will stop running. - - ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step34.png) diff --git a/python/llm/portable-executable/setup.md b/python/llm/portable-executable/setup.md deleted file mode 100644 index 22520c64075..00000000000 --- a/python/llm/portable-executable/setup.md +++ /dev/null @@ -1,5 +0,0 @@ -# BigDL-LLM Portable Executable Setup Script For Windows - -# How to use - -Just simply run `setup.bat` and it will download and install all dependency and generate a zip file for user to use. diff --git a/python/llm/portable-zip/.gitignore b/python/llm/portable-zip/.gitignore new file mode 100644 index 00000000000..fa79eccd951 --- /dev/null +++ b/python/llm/portable-zip/.gitignore @@ -0,0 +1,2 @@ +python-embed +bigdl-llm.zip \ No newline at end of file diff --git a/python/llm/portable-zip/README.md b/python/llm/portable-zip/README.md new file mode 100644 index 00000000000..a8202f5567a --- /dev/null +++ b/python/llm/portable-zip/README.md @@ -0,0 +1,37 @@ +# BigDL-LLM Portable Zip For Windows: User Guide + +## Introduction + +This portable zip includes everything you need to run an LLM with BigDL-LLM optimizations (except models) . Please refer to [How to use](#how-to-use) section to get started. + +### 13B model running on an Intel 11-Gen Core PC (real-time screen capture) + +
+ +### Verified Models + +- ChatGLM2-6b +- Baichuan-13B-Chat +- Baichuan2-7B-Chat +- internlm-chat-7b-8k +- Llama-2-7b-chat-hf + +## How to use + +1. Download the zip from link [here](). +2. (Optional) You could also build the zip on your own. Run `setup.bat` and it will generate the zip file. +3. Unzip `bigdl-llm.zip`. +4. Download the model to your computer. Please ensure there is a file named `config.json` in the model folder, otherwise the script won't work. + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step1.png) + +5. Go into the unzipped folder and double click `chat.bat`. Input the path of the model (e.g. `path\to\model`, note that there's no slash at the end of the path). Press Enter and wait until model finishes loading. Then enjoy chatting with the model! + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step2.png) + +6. If you want to stop chatting, just input `stop` and the model will stop running. + + ![](https://llm-assets.readthedocs.io/en/latest/_images/one-click-installer-user-guide-step34.png) diff --git a/python/llm/portable-executable/chat.bat b/python/llm/portable-zip/chat.bat similarity index 61% rename from python/llm/portable-executable/chat.bat rename to python/llm/portable-zip/chat.bat index b02c961536f..832eb13d588 100644 --- a/python/llm/portable-executable/chat.bat +++ b/python/llm/portable-zip/chat.bat @@ -5,4 +5,6 @@ set PYTHONUNBUFFERED=1 set /p modelpath="Please enter the model path: " -.\python-embed\python.exe .\chat.py --model-path="%modelpath%" \ No newline at end of file +.\python-embed\python.exe .\chat.py --model-path="%modelpath%" + +pause \ No newline at end of file diff --git a/python/llm/portable-executable/chat.py b/python/llm/portable-zip/chat.py similarity index 100% rename from python/llm/portable-executable/chat.py rename to python/llm/portable-zip/chat.py diff --git a/python/llm/portable-executable/setup.bat b/python/llm/portable-zip/setup.bat similarity index 92% rename from python/llm/portable-executable/setup.bat rename to python/llm/portable-zip/setup.bat index de8ad28c273..199902143ca 100644 --- a/python/llm/portable-executable/setup.bat +++ b/python/llm/portable-zip/setup.bat @@ -20,4 +20,4 @@ cd .. %python-embed% -m pip install bigdl-llm[all] transformers_stream_generator tiktoken einops colorama :: compress the python and scripts -powershell -Command "Compress-Archive -Path '.\python-embed', '.\chat.bat', '.\chat.py', '.\README.md' -DestinationPath .\portable-executable.zip" +powershell -Command "Compress-Archive -Path '.\python-embed', '.\chat.bat', '.\chat.py', '.\README.md' -DestinationPath .\bigdl-llm.zip" diff --git a/python/llm/portable-zip/setup.md b/python/llm/portable-zip/setup.md new file mode 100644 index 00000000000..5810a55981a --- /dev/null +++ b/python/llm/portable-zip/setup.md @@ -0,0 +1,5 @@ +# BigDL-LLM Portable Zip Setup Script For Windows + +# How to use + +Just simply run `setup.bat` and it will download and install all dependency and generate `bigdl-llm.zip` for user to use. 
diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py index 7023a4bdb6a..579ee913ce6 100644 --- a/python/llm/src/bigdl/llm/ggml/quantize.py +++ b/python/llm/src/bigdl/llm/ggml/quantize.py @@ -31,7 +31,8 @@ "asym_int5": 7, # q5_1 in ggml "sym_int8": 8, # q8_0 in ggml "nf4": 10, - "nf3": 11} + "nf3": 11, + "fp16": 12} _llama_quantize_type = {"q4_0": 2, "q4_1": 3, @@ -71,7 +72,7 @@ def quantize(input_path: str, output_path: str, :param dtype: Quantization method which differs in the resulting model disk size and inference speed. Defalut to `q4_0`. Difference model family may support different types, now the supported list is: - llama : "q4_0", "q4_1", "q4_2" + llama : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" bloom : "q4_0", "q4_1" gptneox : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" starcoder : "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" diff --git a/python/llm/src/bigdl/llm/optimize.py b/python/llm/src/bigdl/llm/optimize.py index dfb0c344764..3169660c067 100644 --- a/python/llm/src/bigdl/llm/optimize.py +++ b/python/llm/src/bigdl/llm/optimize.py @@ -24,6 +24,12 @@ from accelerate.utils import set_module_tensor_to_device from bigdl.llm.ggml.quantize import ggml_tensor_qtype from bigdl.llm.utils.common import invalidInputError +from bigdl.llm.transformers.utils import extract_local_archive_file, get_local_shard_files +import transformers +from transformers import PreTrainedModel +from .utils.common import MuteHFLogger +from .utils.lazy_load_torch import LazyLoadTensors +from contextlib import ExitStack, contextmanager # Simulate the Hugging Face format @@ -37,7 +43,14 @@ def _save_low_bit(self, save_dir, *args, **kwargs): f" load_in_4bit or load_in_low_bit parameter to load a 4-bit model first.") os.makedirs(save_dir, exist_ok=True) model_path = os.path.join(save_dir, PYTORCH_MODEL_NAME) - torch.save(self.state_dict(), model_path, *args, **kwargs) + if isinstance(self, PreTrainedModel): + # We borrowed this method to adapt to Transformer model cases + # as much as possible, and later we may merge these two situations + self.save_pretrained(save_dir) + else: + # TODO: For the lowbit model still larger than 8GB, + # save it into shards. + torch.save(self.state_dict(), model_path, *args, **kwargs) with open(os.path.join(save_dir, CONFIG_NAME), "w") as json_file: json.dump(self._bigdl_config, json_file) @@ -49,14 +62,44 @@ class DisableTorchAllocTensor(): def __init__(self) -> None: self._old_torch_load_state_dict = Module.load_state_dict self._old_torch_to_device = Module.to + self._old_torch_load_from_state_dict = Module._load_from_state_dict + # Chatglm2 init weights manually, + # and `skip_init` init on `cpu` by default + self._old_skip_init = torch.nn.utils.skip_init def __enter__(self): Module.load_state_dict = lambda *args, **kwargs: _IncompatibleKeys([], []) + Module._load_from_state_dict = lambda *args, **kwargs: None Module.to = lambda self, *args, **kwargs: self + def skip_init_on_meta(module_cls, *args, **kwargs): + kwargs['device'] = 'meta' + return self._old_skip_init(module_cls, *args, **kwargs) + torch.nn.utils.skip_init = skip_init_on_meta + def __exit__(self, exc_type, exc_value, traceback): Module.load_state_dict = self._old_torch_load_state_dict + Module._load_from_state_dict = self._old_torch_load_from_state_dict Module.to = self._old_torch_to_device + torch.nn.utils.skip_init = self._old_skip_init + + +class ContextManagers: + """ + Wrapper for `contextlib.ExitStack` which enters a collection of context managers. 
+ Adaptation of `ContextManagers` in the `fastcore` library. + """ + + def __init__(self, context_managers): + self.context_managers = context_managers + self.stack = ExitStack() + + def __enter__(self): + for context_manager in self.context_managers: + self.stack.enter_context(context_manager) + + def __exit__(self, *args, **kwargs): + self.stack.__exit__(*args, **kwargs) def low_bit_sanity_check(model_path): @@ -76,31 +119,49 @@ def low_bit_sanity_check(model_path): return low_bit -def load_low_bit(model_or_creator, model_path, **kwargs): - is_creator = not isinstance(model_or_creator, torch.nn.Module) \ - and callable(model_or_creator) - low_bit = low_bit_sanity_check(model_path) +@contextmanager +def low_memory_init(): + init_contexts = [] + init_contexts.extend([init_empty_weights(), DisableTorchAllocTensor()]) + # Load everything except Tensors' parameters + init_contexts.append(LazyLoadTensors()) + # As we have muted the `torch.load`, this will trigger a key missing warning in hf + # but this matters not for we will load again later. + init_contexts.append(MuteHFLogger(logger=transformers.modeling_utils.logger)) + with ContextManagers(init_contexts): + yield + +def load_low_bit(model, model_path): + low_bit = low_bit_sanity_check(model_path) + invalidInputError(isinstance(model, torch.nn.Module), + "model should be a instance of " + f"`torch.nn.Module`, but got {type(model)} at last.") if low_bit: - # a creator - if is_creator: - with init_empty_weights(), DisableTorchAllocTensor(): - model = model_or_creator(**kwargs) - else: - model = model_or_creator invalidInputError(isinstance(model, torch.nn.Module), - "model_or_creator should be a instance of " - "`torch.nn.Module`or a method that returns " - f"an instance of `torch.nn.Module`, but got {type(model)} at last.") + "model should be an instance of `torch.nn.Module`, " + f"but got {type(model)} at last.") + invalidInputError(model.device.type in ('cpu', 'meta'), + "Expect model on device `cpu` or `meta`, " + f"but got device type {model.device.type}") qtype = ggml_tensor_qtype[low_bit] model = ggml_convert_low_bit(model, qtype=qtype, convert_shape_only=True) - state_dict = torch.load(os.path.join(model_path, PYTORCH_MODEL_NAME)) - if is_creator: + resolved_archive_file, is_sharded = extract_local_archive_file(model_path, subfolder="") + if is_sharded: + # For now only shards transformers models + # can run in this branch. 
+ resolved_archive_file, _ = \ + get_local_shard_files(model_path, + resolved_archive_file, + subfolder="") + else: + resolved_archive_file = [os.path.join(model_path, PYTORCH_MODEL_NAME)] + + for model_file in resolved_archive_file: + state_dict = torch.load(model_file) for param_name, param in state_dict.items(): set_module_tensor_to_device(model, param_name, "cpu", param) - else: - model.load_state_dict(state_dict=state_dict) return model @@ -118,6 +179,12 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True): invalidInputError(low_bit in ggml_tensor_qtype, f"Unknown load_in_low_bit value: {low_bit}, expected:" f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") + invalidInputError(isinstance(model, torch.nn.Module), + "model should be an instance of " + f"`torch.nn.Module`, but got {type(model)} at last.") + invalidInputError(model.device.type == 'cpu', + "Expect model on device `cpu`, " + f"but got device type {model.device.type}") qtype = ggml_tensor_qtype[low_bit] model = ggml_convert_low_bit(model, qtype=qtype, optimize_model=optimize_llm) # add save_low_bit to pretrained model dynamically diff --git a/python/llm/src/bigdl/llm/transformers/convert.py b/python/llm/src/bigdl/llm/transformers/convert.py index b0bc581d9ec..e0c762335ff 100644 --- a/python/llm/src/bigdl/llm/transformers/convert.py +++ b/python/llm/src/bigdl/llm/transformers/convert.py @@ -41,12 +41,13 @@ import warnings import transformers import importlib +from bigdl.llm.ggml.quantize import ggml_tensor_qtype from .utils import logger def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, current_key_name=None, convert_shape_only=False): - from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params + from bigdl.llm.transformers.low_bit_linear import LowBitLinear, FP4Params, FP16Linear has_been_replaced = False for name, module in model.named_children(): @@ -57,33 +58,55 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, # Check if the current key is not in the `modules_to_not_convert` if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): with init_empty_weights(): - new_linear = LowBitLinear( - module.in_features, - module.out_features, - qtype, - module.bias is not None, - ) + new_linear = None + if qtype != ggml_tensor_qtype["fp16"]: + new_linear = LowBitLinear( + module.in_features, + module.out_features, + qtype, + module.bias is not None, + ) + + device_type = module.weight.data.device.type + # Copy the weights + paramsLowBit = FP4Params(data=module.weight.data, + requires_grad=False, + quantized=False, + _shape=None, + convert_shape_only=convert_shape_only, + qtype=qtype).to(device_type) + new_linear._parameters['weight'] = paramsLowBit + else: + # only support two size now + # may generalize to other sizes + if module.in_features in [4096, 11008]: + # esimd fp16 path + new_linear = FP16Linear( + module.in_features, + module.out_features, + qtype, + module.bias is not None, + ) + device_type = module.weight.data.device.type - device_type = module.weight.data.device.type - # Copy the weights - paramsLowBit = FP4Params(data=module.weight.data, - requires_grad=False, - quantized=False, - _shape=None, - convert_shape_only=convert_shape_only, - qtype=qtype).to(device_type) - new_linear._parameters['weight'] = paramsLowBit + # convert here + m, n = module.weight.data.shape + trans_weight = module.weight.data.reshape(m//16, 16, n) + trans_weight = trans_weight.transpose(1, 2).contiguous() + 
new_linear._parameters['weight'] = nn.Parameter(trans_weight) - if module.bias is not None: - new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ - .to(device_type) + # fp16 may generalize to other sizes later + if new_linear is not None: + if module.bias is not None: + new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ + .to(device_type) - model._modules[name] = new_linear - has_been_replaced = True - # Force requires grad to False to avoid unexpected errors - model._modules[name].requires_grad_(False) + model._modules[name] = new_linear + has_been_replaced = True + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) - module.weight = None + module.weight = None # Remove the last key for recursion if len(list(module.children())) > 0: diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py index 931a118f47c..0e3b7dbad46 100644 --- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py +++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py @@ -288,10 +288,10 @@ def ggml_matmul_src1_x_src0_t(src0: torch.Tensor, class MatMulLowBit(torch.autograd.Function): @staticmethod - def forward(ctx, A, weight): + def forward(ctx, A, weight, input_seq_size): ctx.is_empty = False import linear_q4_0 - result = linear_q4_0.forward_new(A, weight.data, weight.qtype) + result = linear_q4_0.forward_new(A, weight.data, weight.qtype, input_seq_size) if any(ctx.needs_input_grad[:2]): ctx.tensors = (A, weight) else: @@ -304,14 +304,14 @@ def backward(ctx, grad_output): if ctx.is_empty: bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - req_gradA, _ = ctx.needs_input_grad + req_gradA, _, _ = ctx.needs_input_grad A, weight = ctx.tensors grad_A, grad_weight = None, None if req_gradA: dequant_weight = linear_q4_0.dequant(A, weight.data, weight.qtype) grad_A = torch.matmul(grad_output, dequant_weight.reshape(weight._shape)) - return grad_A, grad_weight + return grad_A, grad_weight, None class LowBitLinear(nn.Linear): @@ -353,10 +353,12 @@ def forward(self, x: torch.Tensor): # disable the conversion when training if self.conver_to_half and x_2d.shape[0] > 1 and x_2d.dtype == torch.float32: x_2d = x_2d.half() + input_seq_size = x_shape[1] if self.training and x_2d.requires_grad: - result = MatMulLowBit.apply(x_2d, self.weight) + result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size) else: - result = linear_q4_0.forward_new(x_2d, self.weight.data, self.weight.qtype) + result = linear_q4_0.forward_new(x_2d, self.weight.data, self.weight.qtype, + input_seq_size) new_shape = x_shape[:-1] + (self.out_len,) result = result.view(new_shape) if self.bias is not None: @@ -378,3 +380,53 @@ def forward(self, x: torch.Tensor): result += self.bias return result.to(x.dtype) + + +class FP16Linear(nn.Linear): + def __init__(self, input_features, output_features, qtype, bias=True, + conver_to_half=True): + super().__init__(input_features, output_features, bias) + self.in_len = input_features + self.out_len = output_features + self.weight_shape = (self.out_len, self.in_len) + self.weight_length = self.out_len * self.in_len + self.qtype = qtype + self.conver_to_half = conver_to_half + + def forward(self, x: torch.Tensor): + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + x_shape = x.shape + x_2d = x.view(-1, x_shape[-1]) + 
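+        # Note: self.weight was pre-packed in convert.py into a blocked fp16 layout of shape (out_len//16, in_len, 16) expected by the ESIMD kernel; the batch / first-token branch below unpacks it back to (out_len, in_len) before calling F.linear.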
+ x0 = self.weight.data + # only work for GPU + invalidInputError(x0.device.type == "xpu", + "FP16 only works for GPU") + try: + import intel_extension_for_pytorch + import linear_fp16_esimd + except ModuleNotFoundError: + invalidInputError(False, + "Please `pip install bigdl_core_xe` first.") + + if x_2d.is_contiguous() is False: + x_2d = x_2d.contiguous() + + if x_2d.shape[0] > 1: + # first token or batch size > 1, re-convert weight + original_weight = self.weight.data.transpose(1, 2) + original_weight = original_weight.reshape(self.out_len, self.in_len) + result = F.linear(x_2d, original_weight.contiguous()) + del original_weight + else: + # rest token, use esimd optimization + result = linear_fp16_esimd.forward(x_2d, self.weight.data) + + new_shape = x_shape[:-1] + (self.out_len,) + result = result.view(new_shape) + if self.bias is not None: + result += self.bias + + return result.to(x.dtype) diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py index 8f926517e16..51b2f656f2b 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/bigdl/llm/transformers/model.py @@ -60,7 +60,7 @@ def from_pretrained(cls, :param load_in_4bit: boolean value, True means load linear's weight to symmetric int 4. Default to be False. :param load_in_low_bit: str value, options are sym_int4, asym_int4, sym_int5, asym_int5 - or sym_int8. sym_int4 means symmetric int 4, asym_int4 means + , sym_int8 or fp16. sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4, etc. Relevant low bit optimizations will be applied to the model. :param optimize_model: boolean value, Whether to further optimize the low_bit llm model. @@ -104,8 +104,9 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): from .convert import ggml_convert_low_bit invalidInputError(q_k in ggml_tensor_qtype, f"Unknown load_in_low_bit value: {q_k}, expected:" - f" sym_int4, asym_int4, sym_int5, asym_int5 or sym_int8.") + f" sym_int4, asym_int4, sym_int5, asym_int5, sym_int8 or fp16.") qtype = ggml_tensor_qtype[q_k] + # In case it needs a second try, # `from_pretrained`` may pop items out in dict # and lead to args missing. diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/bigdl/llm/transformers/utils.py index c0fa9d2a9a6..499765e102f 100644 --- a/python/llm/src/bigdl/llm/transformers/utils.py +++ b/python/llm/src/bigdl/llm/transformers/utils.py @@ -55,7 +55,7 @@ WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json" -def extract_local_archive_file(pretrained_model_name_or_path, subfolder, variant): +def extract_local_archive_file(pretrained_model_name_or_path, subfolder, variant=None): pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)) diff --git a/python/llm/src/bigdl/llm/utils/lazy_load_torch.py b/python/llm/src/bigdl/llm/utils/lazy_load_torch.py new file mode 100644 index 00000000000..4d205b684dd --- /dev/null +++ b/python/llm/src/bigdl/llm/utils/lazy_load_torch.py @@ -0,0 +1,193 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# =========================================================================== +# +# This file is adapted from +# https://github.com/ggerganov/llama.cpp/blob/master/convert.py#L516 +# +# MIT License +# +# Copyright (c) 2023 Georgi Gerganov +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +import torch +from torch.serialization import StorageType +import pickle +import zipfile +import io +from typing import Dict, IO, Any, Callable +from dataclasses import dataclass +from .common import invalidInputError + + +item_size = {torch.bfloat16: 2, + torch.float16: 2, + torch.int: 4, + torch.float: 4, + torch.float32: 4, + torch.int8: 1} + + +@dataclass +class LazyStorage: + load: Callable[[int, int], torch.Tensor] + kind: StorageType + description: str + + +@dataclass +class LazyTensor: + _load: Callable[[], torch.Tensor] + shape: list[int] + data_type: torch.dtype + description: str + + def load(self) -> torch.Tensor: + ret = self._load() + return ret + + def to(self, data_type): + # self.validate_conversion_to(data_type) + + def load() -> torch.Tensor: + print(f"to {data_type}") + return self.load().to(data_type) + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + +def _load(pickle_fp, map_location, picklemoudle, pickle_file='data.pkl', zip_file=None): + + load_module_mapping: Dict[str, str] = { + 'torch.tensor': 'torch._tensor' + } + + class LazyUnpickler(picklemoudle.Unpickler): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): + super().__init__(fp) + self.data_base_path = data_base_path + self.zip_file = zip_file + + def persistent_load(self, pid): + data_type = pid[1].dtype + filename_stem = pid[2] + filename = f'{self.data_base_path}/{filename_stem}' + info = self.zip_file.getinfo(filename) + + def load(offset: int, elm_count: int): + dtype = data_type + fp = self.zip_file.open(info) + fp.seek(offset * item_size[dtype]) + size = elm_count * item_size[dtype] + data = fp.read(size) + return torch.frombuffer(bytearray(data), dtype=dtype) + description = f'storage data_type={data_type} ' \ + 
'path-in-zip={filename} path={self.zip_file.filename}' + return LazyStorage(load=load, kind=pid[1], description=description) + + @staticmethod + def lazy_rebuild_tensor_v2(storage: Any, + storage_offset: Any, + size: Any, + stride: Any, + requires_grad: Any, + backward_hooks: Any, + metadata: Any = None) -> LazyTensor: + invalidInputError(isinstance(storage, LazyStorage), + "storage should be an instance of class `LazyStorage`, " + f"but get {type(storage)}.") + + def load() -> torch.Tensor: + elm_count = stride[0] * size[0] + return storage.load(storage_offset, elm_count).reshape(size) + description = f'pickled storage_offset={storage_offset} in {storage.description}' + return LazyTensor(load, list(size), storage.kind.dtype, description) + + @staticmethod + def rebuild_from_type_v2(func, new_type, args, state): + return func(*args) + + CLASSES: dict[tuple[str, str], Any] = { + ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), + ('torch', 'Tensor'): LazyTensor, + } + + def find_class(self, mod_name, name): + if (mod_name, name) in self.CLASSES: + return self.CLASSES[(mod_name, name)] + if type(name) is str and 'Storage' in name: + try: + return StorageType(name) + except KeyError: + pass + mod_name = load_module_mapping.get(mod_name, mod_name) + return super().find_class(mod_name, name) + + unpickler = LazyUnpickler(pickle_fp, + data_base_path=pickle_file, + zip_file=zip_file) + result = unpickler.load() + + return result + + +# This can only be used on huggingface transformers loaded from a zip file. +def lazyload( + f, + *args, + **kwargs +): + if isinstance(f, io.BufferedIOBase): + fp = f + else: + fp = open(f, 'rb') + zf = zipfile.ZipFile(fp) + pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] + invalidInputError(len(pickle_paths) == 1, + "There should be only one pickle_paths found, " + f"but get {pickle_paths}. 
") + pickle_fp = zf.open(pickle_paths[0], 'r') + state_dict = _load(pickle_fp, None, pickle, pickle_file=pickle_paths[0][:-4], zip_file=zf) + return state_dict + + +class LazyLoadTensors: + def __init__(self): + self.torch_load = torch.load + + def __enter__(self): + torch.load = lazyload + + def __exit__(self, exc_type, exc_value, traceback): + torch.load = self.torch_load diff --git a/python/llm/test/convert/test_convert_model.py b/python/llm/test/convert/test_convert_model.py index e8cc30792a0..1a0495d6d76 100644 --- a/python/llm/test/convert/test_convert_model.py +++ b/python/llm/test/convert/test_convert_model.py @@ -22,6 +22,7 @@ from bigdl.llm import llm_convert from bigdl.llm.transformers import AutoModelForCausalLM +from bigdl.llm.optimize import optimize_model, load_low_bit, low_memory_init llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH') @@ -87,5 +88,22 @@ def test_transformer_convert_llama_save_load(self): newModel = AutoModelForCausalLM.load_low_bit(tempdir) assert newModel is not None + def test_optimize_transformers_llama(self): + from transformers import AutoModelForCausalLM as AutoCLM + with tempfile.TemporaryDirectory(dir=output_dir) as tempdir: + model = AutoCLM.from_pretrained(llama_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True) + model = optimize_model(model) + model.save_low_bit(tempdir) + with low_memory_init(): + new_model = AutoCLM.from_pretrained(tempdir, + torch_dtype="auto", + trust_remote_code=True) + new_model = load_low_bit(new_model, + model_path=tempdir) + assert new_model is not None + if __name__ == '__main__': pytest.main([__file__])