diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..073d8e2d9c4 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,105 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + # push: + # branches: [ "main" ] + schedule: + - cron: '24 14 * * 0' + workflow_dispatch: + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. + # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + shell: bash + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" + + - name: Generate Security Report + uses: rsdmike/github-security-report-action@v3.0.4 + with: + template: report + token: ${{ secrets.SECURITY_TOKEN }} + + - name: GitHub Upload Release Artifacts + uses: actions/upload-artifact@v2 + with: + name: report + path: | + ./report.pdf diff --git a/.github/workflows/llm-binary-build.yml b/.github/workflows/llm-binary-build.yml index a61bd26e7c3..c38a9a66c0a 100644 --- a/.github/workflows/llm-binary-build.yml +++ b/.github/workflows/llm-binary-build.yml @@ -72,12 +72,6 @@ jobs: export http_proxy=${HTTP_PROXY} export https_proxy=${HTTPS_PROXY} yum install -y gcc-toolset-11 cmake git - conda remove -n python39 --all -y - conda create -n python39 python=3.9 -y - conda remove -n python310 --all -y - conda create -n python310 python=3.10 -y - conda remove -n python311 --all -y - conda create -n python311 python=3.11 -y - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3 with: repository: "intel-analytics/llm.cpp" @@ -109,42 +103,6 @@ jobs: mv build/libstarcoder-api.so release/libstarcoder-api.so mv build/quantize-starcoder release/quantize-starcoder mv build/libstarcoder.so release/libstarcoder_avxvnni.so - - name: Build Chatglm - shell: bash - run: | - source activate python39 || conda activate python39 - cd src/chatglm - scl enable gcc-toolset-11 "cmake -B build" - scl enable gcc-toolset-11 "cmake --build build --config Release -j" - - name: Move Chatglm binaries - shell: bash - run: | - mv src/chatglm/build/main release/main-chatglm_vnni - mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so release/chatglm_C.cpython-39-x86_64-linux-gnu.so - - name: Build Chatglm Py310 - shell: bash - run: | - source activate python310 || conda activate python310 - cd src/chatglm - rm -r build - scl enable gcc-toolset-11 "cmake -B build" - scl enable gcc-toolset-11 "cmake --build build --config Release -j" - - name: Move Chatglm binaries Py310 - shell: bash - run: | - mv src/chatglm/build/_C.cpython-310-x86_64-linux-gnu.so release/chatglm_C.cpython-310-x86_64-linux-gnu.so - - name: Build Chatglm Py311 - shell: bash - run: | - source activate python311 || conda activate python311 - cd src/chatglm - rm -r build - scl enable gcc-toolset-11 "cmake -B build" - scl enable gcc-toolset-11 "cmake --build build --config Release -j" - - name: Move Chatglm binaries Py311 - shell: bash - run: | - mv 
src/chatglm/build/_C.cpython-311-x86_64-linux-gnu.so release/chatglm_C.cpython-311-x86_64-linux-gnu.so - name: Archive build files uses: actions/upload-artifact@v3 with: @@ -155,9 +113,6 @@ jobs: shell: bash run: | make clean - conda remove -n python39 --all -y - conda remove -n python310 --all -y - conda remove -n python311 --all -y check-linux-avx512-artifact: if: ${{contains(inputs.platform, 'Linux')}} @@ -255,6 +210,7 @@ jobs: path: | avx_release - name: Clean up test environment + if: ${{ always() }} shell: bash run: | make clean @@ -286,8 +242,6 @@ jobs: export http_proxy=${HTTP_PROXY} export https_proxy=${HTTPS_PROXY} yum install -y gcc-toolset-11 cmake git - conda remove -n python39 --all -y - conda create -n python39 python=3.9 -y - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3 with: repository: "intel-analytics/llm.cpp" @@ -299,11 +253,6 @@ jobs: run: | scl enable gcc-toolset-11 "cmake -DONLYAVX=OFF -DONLYAVX2=OFF -B build" scl enable gcc-toolset-11 "cmake --build build --config Release -j" - # build chatglm - source activate python39 || conda activate python39 - cd src/chatglm - scl enable gcc-toolset-11 "cmake -B build" - scl enable gcc-toolset-11 "cmake --build build --config Release -j" - name: Move amx release binary shell: bash run: | @@ -316,9 +265,6 @@ jobs: mv build/libgptneox.so amx_release/libgptneox_amx.so mv build/quantize-starcoder amx_release/quantize-starcoder_amx mv build/libstarcoder.so amx_release/libstarcoder_amx.so - # chatglm binary files - mv src/chatglm/build/main amx_release/main-chatglm_amx - # mv src/chatglm/build/_C.cpython-39-x86_64-linux-gnu.so amx_release/chatglm_C.cpython-39-x86_64-linux-gnu.so - name: Archive amx build files uses: actions/upload-artifact@v3 with: @@ -329,7 +275,6 @@ jobs: shell: bash run: | make clean - conda remove -n python39 --all -y check-windows-avx2-artifact: if: ${{contains(inputs.platform, 'Windows')}} @@ -393,10 +338,6 @@ jobs: needs: check-windows-avx-vnni-artifact if: needs.check-windows-avx-vnni-artifact.outputs.if-exists == 'false' steps: - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.9" - name: Set access token run: | echo "github_access_token=$env:GITHUB_ACCESS_TOKEN" >> $env:GITHUB_ENV @@ -438,47 +379,6 @@ jobs: # mv build/Release/main-starcoder.exe release/main-starcoder_vnni.exe mv build/Release/quantize-starcoder.exe release/quantize-starcoder_vnni.exe mv build/Release/starcoder.dll release/libstarcoder_vnni.dll - - name: Build Chatglm - shell: powershell - run: | - cd src/chatglm - cmake -DAVXVNNI=ON -B build - cmake --build build --config Release -j - - name: Move Chatglm binaries - shell: powershell - run: | - mv src/chatglm/build/Release/main.exe release/main-chatglm_vnni.exe - mv src/chatglm/build/Release/_C.cp39-win_amd64.pyd release/chatglm_C.cp39-win_amd64.pyd - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - name: Build Chatglm Py310 - shell: powershell - run: | - cd src/chatglm - rm -r build - cmake -DAVXVNNI=ON -B build - cmake --build build --config Release -j - - name: Move Chatglm binaries Py310 - shell: powershell - run: | - mv src/chatglm/build/Release/_C.cp310-win_amd64.pyd release/chatglm_C.cp310-win_amd64.pyd - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.11" - - name: Build Chatglm Py311 - shell: powershell - run: | - cd src/chatglm - rm -r build - cmake -DAVXVNNI=ON -B build - cmake --build build --config Release -j - - name: Move 
Chatglm binaries Py311 - shell: powershell - run: | - mv src/chatglm/build/Release/_C.cp311-win_amd64.pyd release/chatglm_C.cp311-win_amd64.pyd - name: Archive build files uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml index 9f49a997e50..b8ef55bef67 100644 --- a/.github/workflows/llm_performance_tests.yml +++ b/.github/workflows/llm_performance_tests.yml @@ -97,11 +97,23 @@ jobs: export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 cp python/llm/test/benchmark/arc-perf-test.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one + mkdir test_batch1 + mkdir test_batch2 + # batch_size 1 # hide time info sed -i 's/str(end - st)/"xxxxxx"/g' run.py # change csv name - sed -i 's/{today}/{today}_test1/g' run.py + sed -i 's/{today}/{today}_test1_batch1/g' run.py python run.py + mv *.csv test_batch1 + # batch_size 2 + cd ../../../../../ + cp python/llm/test/benchmark/arc-perf-test-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml + cd python/llm/dev/benchmark/all-in-one + # change csv name + sed -i 's/batch1/batch2/g' run.py + python run.py + mv *.csv test_batch2 - name: Test on xpu(transformers==4.37.0) shell: bash @@ -111,36 +123,101 @@ jobs: export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 # upgrade transformers for model Qwen/Qwen1.5-7B-Chat python -m pip install transformers==4.37.0 + # batch_size 1 cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml cd python/llm/dev/benchmark/all-in-one # change csv name - sed -i 's/test1/test2/g' run.py + sed -i 's/test1_batch2/test2_batch1/g' run.py + python run.py + mv *.csv test_batch1 + # batch_size 2 + cd ../../../../../ + cp python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml python/llm/dev/benchmark/all-in-one/config.yaml + cd python/llm/dev/benchmark/all-in-one + # change csv name + sed -i 's/batch1/batch2/g' run.py python run.py + mv *.csv test_batch2 - name: Concat csv and generate html shell: bash run: | - cd python/llm/dev/benchmark/all-in-one - python ../../../test/benchmark/concat_csv.py + # batch_size 1 + cd python/llm/dev/benchmark/all-in-one/test_batch1 + python ../../../../test/benchmark/concat_csv.py for file in *.csv; do if [[ $file != *test* ]]; then - cp "$file" $CSV_SAVE_PATH + cp "$file" $CSV_SAVE_PATH/batch_size_1 fi done python -m pip install pandas==1.5.3 - cd ../../../test/benchmark - python csv_to_html.py -f $CSV_SAVE_PATH + cd ../../../../test/benchmark + python csv_to_html.py -f $CSV_SAVE_PATH/batch_size_1 + # batch_size 2 + cd ../../../../ + cd python/llm/dev/benchmark/all-in-one/test_batch2 + python ../../../../test/benchmark/concat_csv.py + for file in *.csv; do + if [[ $file != *test* ]]; then + cp "$file" $CSV_SAVE_PATH/batch_size_2 + fi + done + cd ../../../../test/benchmark + python csv_to_html.py -f $CSV_SAVE_PATH/batch_size_2 + + - name: Merge and sort csv files of multiple batches and generate html + shell: bash + run: | + cd python/llm/test/benchmark + mkdir merged_temp + # go through all the files and go to merged_temp + cd ../../dev/benchmark/all-in-one/test_batch1 + for file in *.csv; do + if [[ $file != *test* ]]; then + cp "$file" ../../../../test/benchmark/merged_temp + fi + done + cd ../test_batch2 + for file in *.csv; do + if [[ $file != *test* ]]; then + cp "$file" ../../../../test/benchmark/merged_temp + fi + done + cd ../../../../test/benchmark + python merge_csv_batch.py -f ./merged_temp + 
cd merged_temp + find . -name "*batch*.csv" -delete + for file in *.csv; do + cp "$file" $CSV_SAVE_PATH/merged + done + cd .. + python csv_to_html.py -f $CSV_SAVE_PATH/merged + rm -r merged_temp - name: Check and upload results to ftp shell: bash run: | - cd python/llm/dev/benchmark/all-in-one - python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml - python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml + # batch_size 1 + cd python/llm/dev/benchmark/all-in-one/test_batch1 + python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test.yaml + python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437.yaml find . -name "*test*.csv" -delete if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/ fi + cd ../ + rm -r test_batch1 + # batch_size 2 + cd test_batch2 + python ../../../../test/benchmark/check_results.py -c test1 -y ../../../../test/benchmark/arc-perf-test-batch2.yaml + python ../../../../test/benchmark/check_results.py -c test2 -y ../../../../test/benchmark/arc-perf-transformers-437-batch2.yaml + find . -name "*test*.csv" -delete + if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then + curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/ + fi + cd ../ + rm -r test_batch2 + llm-performance-test-on-spr: if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-spr' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests @@ -389,6 +466,14 @@ jobs: sed -i "s/date.today()/\"$date_for_test_version\"/g" python/llm/dev/benchmark/all-in-one/run.py + - name: Add extra warmup for chatglm3-6b int4+fp32 for more stable results + shell: bash + run: | + sed -i '/^\s*result = run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)/ i\ + if repo_id in ["THUDM/chatglm3-6b"]:\ + run_transformer_int4_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming) + ' python/llm/dev/benchmark/all-in-one/run.py + - name: Prepare igpu perf test (32-32) shell: bash run: | @@ -414,13 +499,13 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Qwen1.5 (32-32) + - name: Prepare igpu perf test for transformers 4.37 (32-32) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_437.yaml - - name: Test on igpu for Qwen1.5 (32-32) + - name: Test on igpu for transformers 4.37 (32-32) shell: cmd run: | call conda activate igpu-perf @@ -482,13 +567,13 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (1024-128) + - name: Prepare igpu perf test for transformers 4.37 (1024-128) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml - - name: Test on igpu for Qwen 1.5 (1024-128) + - name: Test on igpu for transformers 4.37 (1024-128) shell: cmd run: | call conda activate 
igpu-perf @@ -549,13 +634,13 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (2048-256) + - name: Prepare igpu perf test for transformers 4.37 (2048-256) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_437.yaml - - name: Test on igpu for Qwen 1.5 (2048-256) + - name: Test on igpu for transformers 4.37 (2048-256) shell: cmd run: | call conda activate igpu-perf @@ -616,13 +701,13 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128) + - name: Prepare igpu perf test for transformers 4.37 (load_low_bit 1024-128) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml - - name: Test on igpu for Qwen 1.5 (load_low_bit 1024-128) + - name: Test on igpu for transformers 4.37 (load_low_bit 1024-128) shell: cmd run: | call conda activate igpu-perf @@ -681,13 +766,13 @@ jobs: call conda deactivate - - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128) + - name: Prepare igpu perf test for transformers 4.37 (int4+fp16 1024-128) shell: bash run: | sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml - - name: Test on igpu for Qwen 1.5 (int4+fp16 1024-128) + - name: Test on igpu for transformers 4.37 (int4+fp16 1024-128) shell: cmd run: | call conda activate igpu-perf diff --git a/README.md b/README.md index 8901f4bb636..d8ed7eb66d3 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,7 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM | InternLM | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm) | | Qwen | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen) | | Qwen1.5 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen1.5) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5) | +| Qwen2 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2) | | Qwen-VL | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen-vl) | | Aquila | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila) | | Aquila2 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/aquila2) | @@ -207,6 +208,7 @@ Over 50 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM | CodeGemma | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegemma) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegemma) | | Command-R/cohere | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/cohere) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/cohere) | | CodeGeeX2 | 
[link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegeex2) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegeex2) | +| MiniCPM | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm) | [link](python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm) | ## Get Support - Please report a bug or raise a feature request by opening a [Github Issue](https://github.com/intel-analytics/ipex-llm/issues) diff --git a/docker/llm/finetune/lora/cpu/docker/Dockerfile b/docker/llm/finetune/lora/cpu/docker/Dockerfile index 0dd34e70b44..4c77f40b88e 100644 --- a/docker/llm/finetune/lora/cpu/docker/Dockerfile +++ b/docker/llm/finetune/lora/cpu/docker/Dockerfile @@ -20,7 +20,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h RUN mkdir /ipex_llm/data && mkdir /ipex_llm/model && \ # Install python 3.11.1 - apt-get update && apt-get install -y curl wget gpg gpg-agent software-properties-common git gcc g++ make libunwind8-dev zlib1g-dev libssl-dev libffi-dev && \ + apt-get update && apt-get install -y curl wget gpg gpg-agent git gcc g++ make libunwind8-dev zlib1g-dev libssl-dev libffi-dev && \ mkdir -p /opt/python && \ cd /opt/python && \ wget https://www.python.org/ftp/python/3.11.1/Python-3.11.1.tar.xz && \ diff --git a/docker/llm/finetune/qlora/cpu/docker/Dockerfile b/docker/llm/finetune/qlora/cpu/docker/Dockerfile index 553c4d3ac7a..6c908c7b792 100644 --- a/docker/llm/finetune/qlora/cpu/docker/Dockerfile +++ b/docker/llm/finetune/qlora/cpu/docker/Dockerfile @@ -21,7 +21,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \ # Install python 3.11.1 apt-get update && \ - apt-get install -y curl wget gpg gpg-agent software-properties-common git gcc g++ make libunwind8-dev libbz2-dev zlib1g-dev libssl-dev libffi-dev && \ + apt-get install -y curl wget gpg gpg-agent git gcc g++ make libunwind8-dev libbz2-dev zlib1g-dev libssl-dev libffi-dev && \ mkdir -p /opt/python && \ cd /opt/python && \ wget https://www.python.org/ftp/python/3.11.1/Python-3.11.1.tar.xz && \ diff --git a/docker/llm/finetune/qlora/cpu/docker/Dockerfile.k8s b/docker/llm/finetune/qlora/cpu/docker/Dockerfile.k8s index 469a1bbf631..3c57c38f02f 100644 --- a/docker/llm/finetune/qlora/cpu/docker/Dockerfile.k8s +++ b/docker/llm/finetune/qlora/cpu/docker/Dockerfile.k8s @@ -22,7 +22,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] h RUN mkdir -p /ipex_llm/data && mkdir -p /ipex_llm/model && \ # Install python 3.11.1 apt-get update && apt-get install -y openssh-server openssh-client libcap2-bin gnupg2 ca-certificates \ - curl wget gpg gpg-agent software-properties-common git \ + curl wget gpg gpg-agent git \ gcc g++ make libunwind8-dev zlib1g-dev libssl-dev libffi-dev && \ mkdir -p /opt/python && \ cd /opt/python && \ diff --git a/docker/llm/finetune/xpu/Dockerfile b/docker/llm/finetune/xpu/Dockerfile index e71f0758f02..7af849b422f 100644 --- a/docker/llm/finetune/xpu/Dockerfile +++ b/docker/llm/finetune/xpu/Dockerfile @@ -14,13 +14,12 @@ RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-P # update dependencies apt-get update && \ # install basic dependencies - apt-get install -y curl wget git gnupg gpg-agent software-properties-common libunwind8-dev vim less && \ + apt-get install -y curl wget git gnupg gpg-agent libunwind8-dev vim less && \ # install Intel GPU driver apt-get install -y intel-opencl-icd 
intel-level-zero-gpu level-zero level-zero-dev --allow-downgrades && \ # install python 3.11 ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ env DEBIAN_FRONTEND=noninteractive apt-get update && \ - add-apt-repository ppa:deadsnakes/ppa -y && \ apt-get install -y python3.11 python3-pip python3.11-dev python3-wheel python3.11-distutils && \ # avoid axolotl lib conflict apt-get remove -y python3-blinker && apt autoremove -y && \ diff --git a/docker/llm/inference-cpp/Dockerfile b/docker/llm/inference-cpp/Dockerfile index 63e249f998d..3754c5ab477 100644 --- a/docker/llm/inference-cpp/Dockerfile +++ b/docker/llm/inference-cpp/Dockerfile @@ -21,8 +21,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO # Install PYTHON 3.11 and IPEX-LLM[xpu] ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ env DEBIAN_FRONTEND=noninteractive apt-get update && \ - apt install software-properties-common libunwind8-dev vim less -y && \ - add-apt-repository ppa:deadsnakes/ppa -y && \ + apt install libunwind8-dev vim less -y && \ apt-get install -y python3.11 git curl wget && \ rm /usr/bin/python3 && \ ln -s /usr/bin/python3.11 /usr/bin/python3 && \ diff --git a/docker/llm/inference/cpu/docker/Dockerfile b/docker/llm/inference/cpu/docker/Dockerfile index 319ffcdff11..f0494ff3cc7 100644 --- a/docker/llm/inference/cpu/docker/Dockerfile +++ b/docker/llm/inference/cpu/docker/Dockerfile @@ -12,22 +12,20 @@ COPY ./start-notebook.sh /llm/start-notebook.sh # Update the software sources RUN env DEBIAN_FRONTEND=noninteractive apt-get update && \ # Install essential packages - apt install software-properties-common libunwind8-dev vim less -y && \ + apt install libunwind8-dev vim less -y && \ # Install git, curl, and wget apt-get install -y git curl wget && \ # Install Python 3.11 - # Add Python 3.11 PPA repository - add-apt-repository ppa:deadsnakes/ppa -y && \ # Install Python 3.11 apt-get install -y python3.11 && \ + # Install Python 3.11 development and utility packages + apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \ # Remove the original /usr/bin/python3 symbolic link rm /usr/bin/python3 && \ # Create a symbolic link pointing to Python 3.11 at /usr/bin/python3 ln -s /usr/bin/python3.11 /usr/bin/python3 && \ # Create a symbolic link pointing to /usr/bin/python3 at /usr/bin/python ln -s /usr/bin/python3 /usr/bin/python && \ - # Install Python 3.11 development and utility packages - apt-get install -y python3-pip python3.11-dev python3-wheel python3.11-distutils && \ # Download and install pip, install FastChat from source requires PEP 660 support curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python3 get-pip.py && \ diff --git a/docker/llm/inference/xpu/docker/Dockerfile b/docker/llm/inference/xpu/docker/Dockerfile index 6994e7cee6a..03b91f58190 100644 --- a/docker/llm/inference/xpu/docker/Dockerfile +++ b/docker/llm/inference/xpu/docker/Dockerfile @@ -26,8 +26,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO # Install PYTHON 3.11 and IPEX-LLM[xpu] ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone && \ env DEBIAN_FRONTEND=noninteractive apt-get update && \ - apt install software-properties-common libunwind8-dev vim less -y && \ - add-apt-repository ppa:deadsnakes/ppa -y && \ + apt install libunwind8-dev vim less -y && \ apt-get install -y python3.11 git curl wget && \ rm /usr/bin/python3 && \ ln -s /usr/bin/python3.11 
/usr/bin/python3 && \ diff --git a/docker/llm/serving/cpu/docker/benchmark_vllm_throughput.py b/docker/llm/serving/cpu/docker/benchmark_vllm_throughput.py index 8f2b783a1ba..0a80360f374 100644 --- a/docker/llm/serving/cpu/docker/benchmark_vllm_throughput.py +++ b/docker/llm/serving/cpu/docker/benchmark_vllm_throughput.py @@ -76,9 +76,13 @@ def run_vllm( enable_prefix_caching: bool, gpu_memory_utilization: float = 0.9, load_in_low_bit: str = "sym_int4", + max_num_batched_tokens: int = 10450, ) -> float: from vllm import SamplingParams from ipex_llm.vllm.cpu.engine import IPEXLLMClass as LLM + warm_prompt = "hi " * (1024 - 1) + warm_requests = [(warm_prompt, 1024, 1024) + for _ in range(8)] llm = LLM(model=model, tokenizer=tokenizer, quantization=quantization, @@ -94,6 +98,22 @@ def run_vllm( enable_prefix_caching=enable_prefix_caching, load_in_low_bit=load_in_low_bit) + for prompt, _, output_len in warm_requests: + sampling_params = SamplingParams( + n=n, + temperature=0.0 if use_beam_search else 1.0, + top_p=1.0, + use_beam_search=use_beam_search, + ignore_eos=True, + max_tokens=output_len, + ) + llm._add_request( + prompt=prompt, + prompt_token_ids=None, + sampling_params=sampling_params, + ) + llm._run_engine(use_tqdm=True) + # Add the requests to the engine. for prompt, _, output_len in requests: sampling_params = SamplingParams( @@ -216,7 +236,9 @@ def main(args: argparse.Namespace): args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.device, - args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit) + args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit, + args.max_num_batched_tokens) + elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -320,9 +342,14 @@ def main(args: argparse.Namespace): parser.add_argument( "--load-in-low-bit", type=str, - choices=["sym_int4", "fp8", "fp16"], + choices=["sym_int4", "fp6", "fp8", "fp16"], default="sym_int4", help="Low-bit format quantization with IPEX-LLM") + parser.add_argument('--max-num-batched-tokens', + type=int, + default=10450, + help='maximum number of batched tokens per iteration') + args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/docker/llm/serving/cpu/docker/start-vllm-service.sh b/docker/llm/serving/cpu/docker/start-vllm-service.sh index 4f23adfa24f..b8e442daa2d 100644 --- a/docker/llm/serving/cpu/docker/start-vllm-service.sh +++ b/docker/llm/serving/cpu/docker/start-vllm-service.sh @@ -11,7 +11,7 @@ python -m ipex_llm.vllm.cpu.entrypoints.openai.api_server \ --device cpu \ --dtype bfloat16 \ --enforce-eager \ - --load-in-low-bit sym_int4 \ + --load-in-low-bit bf16 \ --max-model-len 4096 \ --max-num-batched-tokens 10240 \ --max-num-seqs 12 \ diff --git a/docker/llm/serving/cpu/docker/vllm_offline_inference.py b/docker/llm/serving/cpu/docker/vllm_offline_inference.py index 02829a9e46f..23cee000db7 100644 --- a/docker/llm/serving/cpu/docker/vllm_offline_inference.py +++ b/docker/llm/serving/cpu/docker/vllm_offline_inference.py @@ -49,7 +49,7 @@ device="cpu", dtype="bfloat16", enforce_eager=True, - load_in_low_bit="sym_int4", + load_in_low_bit="bf16", tensor_parallel_size=1) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
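The warm-up pass added to `benchmark_vllm_throughput.py` above (and to its XPU counterpart below) follows a simple pattern: push a small batch of fixed-length dummy prompts through the engine and drain it before the timed requests, so that one-time costs (weight loading and low-bit conversion, kernel compilation, KV-cache allocation) are excluded from the measured throughput. A minimal sketch of that pattern as a standalone helper is shown below; it reuses the same private `LLM._add_request`/`LLM._run_engine` calls the benchmark script already relies on, and the `warm_up_engine` name and defaults are illustrative only.

```python
from vllm import SamplingParams


def warm_up_engine(llm, n_requests=8, prompt_len=1024, output_len=1024):
    """Push fixed-length dummy generations through `llm` and wait for them.

    Mirrors the warm-up block added to run_vllm(): call it once after the
    LLM object is constructed and before issuing the timed requests.
    """
    warm_prompt = "hi " * (prompt_len - 1)
    for _ in range(n_requests):
        llm._add_request(
            prompt=warm_prompt,
            prompt_token_ids=None,
            sampling_params=SamplingParams(
                n=1,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,    # force the full output_len to be generated
                max_tokens=output_len,
            ),
        )
    llm._run_engine(use_tqdm=True)  # block until the warm-up batch completes
```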
diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile index 325239b0a0f..a3511325cb0 100644 --- a/docker/llm/serving/xpu/docker/Dockerfile +++ b/docker/llm/serving/xpu/docker/Dockerfile @@ -12,7 +12,7 @@ RUN cd /llm &&\ # Install ipex-llm[serving] only will update ipex_llm source code without updating # bigdl-core-xe, which will lead to problems apt-get update && \ - apt-get install -y libfabric-dev wrk && \ + apt-get install -y libfabric-dev wrk libaio-dev && \ pip install --pre --upgrade ipex-llm[xpu,serving] && \ pip install transformers==4.37.0 gradio==4.19.2 && \ # Install vLLM-v2 dependencies diff --git a/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py index 01cdb461276..6defa5763d4 100644 --- a/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py +++ b/docker/llm/serving/xpu/docker/benchmark_vllm_throughput.py @@ -76,6 +76,7 @@ def run_vllm( enable_prefix_caching: bool, gpu_memory_utilization: float = 0.9, load_in_low_bit: str = "sym_int4", + max_num_batched_tokens: int = 5000, ) -> float: from vllm import SamplingParams from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM @@ -92,9 +93,30 @@ def run_vllm( kv_cache_dtype=kv_cache_dtype, device=device, enable_prefix_caching=enable_prefix_caching, - load_in_low_bit=load_in_low_bit) + load_in_low_bit=load_in_low_bit, + max_num_batched_tokens=max_num_batched_tokens,) + # Add the requests to the engine. + warm_prompt = "hi " * (1024 - 1) + warm_requests = [(warm_prompt, 1024, 1024) + for _ in range(8)] + for prompt, _, output_len in warm_requests: + sampling_params = SamplingParams( + n=n, + temperature=0.0 if use_beam_search else 1.0, + top_p=1.0, + use_beam_search=use_beam_search, + ignore_eos=True, + max_tokens=output_len, + ) + llm._add_request( + prompt=prompt, + prompt_token_ids=None, + sampling_params=sampling_params, + ) + llm._run_engine(use_tqdm=True) + for prompt, _, output_len in requests: sampling_params = SamplingParams( n=n, @@ -216,7 +238,7 @@ def main(args: argparse.Namespace): args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.device, - args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit) + args.enable_prefix_caching, args.gpu_memory_utilization, args.load_in_low_bit, args.max_num_batched_tokens) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -323,6 +345,12 @@ def main(args: argparse.Namespace): choices=["sym_int4", "fp8", "fp16"], default="sym_int4", help="Low-bit format quantization with IPEX-LLM") + + parser.add_argument('--max-num-batched-tokens', + type=int, + default=5000, + help='maximum number of batched tokens per iteration') + args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model @@ -354,4 +382,5 @@ def main(args: argparse.Namespace): if args.tokenizer != args.model: raise ValueError("Tokenizer must be the same as the model for MII " "backend.") - main(args) \ No newline at end of file + main(args) + diff --git a/docs/readthedocs/source/doc/LLM/DockerGuides/vllm_cpu_docker_quickstart.md b/docs/readthedocs/source/doc/LLM/DockerGuides/vllm_cpu_docker_quickstart.md index 3795d13b060..36b39ed5662 100644 --- a/docs/readthedocs/source/doc/LLM/DockerGuides/vllm_cpu_docker_quickstart.md +++ 
b/docs/readthedocs/source/doc/LLM/DockerGuides/vllm_cpu_docker_quickstart.md @@ -40,7 +40,7 @@ After the container is booted, you could get into the container through `docker docker exec -it ipex-llm-serving-cpu-container /bin/bash ``` -## Running vLLM serving with IPEX-LLM on Intel GPU in Docker +## Running vLLM serving with IPEX-LLM on Intel CPU in Docker We have included multiple vLLM-related files in `/llm/`: 1. `vllm_offline_inference.py`: Used for vLLM offline inference example diff --git a/docs/readthedocs/source/doc/LLM/Overview/FAQ/faq.md index c86d109ed12..caf8bd51648 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/FAQ/faq.md +++ b/docs/readthedocs/source/doc/LLM/Overview/FAQ/faq.md @@ -73,3 +73,7 @@ Make sure intel-basekit>=2024.0.1-43 and intel-level-zero-gpu>=1.3.27191.42-775~ ### Too many open files You may encounter this error during finetuning, expecially when run 70B model. Please raise the system open file limit using `ulimit -n 1048576`. + +### `RuntimeError: could not create a primitive` on Windows + +This error may occur on Windows when multiple GPUs are present. To resolve it, open Device Manager (search for "Device Manager" in the Start menu), expand "Display adapters", and disable every GPU that you do not want to use. Then restart your computer and try again; IPEX-LLM should work correctly. \ No newline at end of file diff --git a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md index e51d029a730..46a7adb31ca 100644 --- a/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md +++ b/docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md @@ -31,7 +31,7 @@ You may also convert Hugging Face *Transformers* models into native INT4 format, ```eval_rst .. note:: - * Currently only llama/bloom/gptneox/starcoder/chatglm model families are supported; for other models, you may use the Hugging Face ``transformers`` INT4 format as described `above <./langchain_api.html#using-hugging-face-transformers-int4-format>`_. + * Currently only llama/bloom/gptneox/starcoder model families are supported; for other models, you may use the Hugging Face ``transformers`` INT4 format as described `above <./langchain_api.html#using-hugging-face-transformers-int4-format>`_. * You may choose the corresponding API developed for specific native models to load the converted model. ``` @@ -41,9 +41,9 @@ from ipex_llm.langchain.llms import LlamaLLM from ipex_llm.langchain.embeddings import LlamaEmbeddings from langchain.chains.question_answering import load_qa_chain -# switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models +# switch to GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models embeddings = LlamaEmbeddings(model_path='/path/to/converted/model.bin') -# switch to ChatGLMLLM/GptneoxLLM/BloomLLM/StarcoderLLM to load other models +# switch to GptneoxLLM/BloomLLM/StarcoderLLM to load other models ipex_llm = LlamaLLM(model_path='/path/to/converted/model.bin') doc_chain = load_qa_chain(ipex_llm, ...)
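To show where the `load_qa_chain(ipex_llm, ...)` line above leads, here is a minimal way the resulting chain might be driven. This is only a sketch against the classic LangChain question-answering interface; the `chain_type` value, document text, and question are illustrative placeholders rather than values taken from the docs.

```python
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain

from ipex_llm.langchain.llms import LlamaLLM

# Load a previously converted native INT4 model (path is a placeholder).
ipex_llm = LlamaLLM(model_path='/path/to/converted/model.bin')

# "stuff" simply concatenates the documents into a single prompt.
doc_chain = load_qa_chain(ipex_llm, chain_type="stuff")

docs = [Document(page_content="IPEX-LLM accelerates local LLM inference on Intel CPUs and GPUs.")]
answer = doc_chain.run(input_documents=docs, question="What does IPEX-LLM do?")
print(answer)
```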
diff --git a/docs/readthedocs/source/doc/LLM/Quickstart/vLLM_quickstart.md index 193579d85c3..7c5b5cbcde4 100644 --- a/docs/readthedocs/source/doc/LLM/Quickstart/vLLM_quickstart.md +++ b/docs/readthedocs/source/doc/LLM/Quickstart/vLLM_quickstart.md @@ -134,6 +134,12 @@ You can tune the service using these four arguments: 3. `--max-num-batched-token`: Maximum number of batched tokens per iteration. 4. `--max-num-seq`: Maximum number of sequences per iteration. Default: 256 +For longer input prompts, we suggest using `--max-num-batched-token` to restrict the service. The reasoning is that peak GPU memory usage occurs while the first token is being generated, so limiting `--max-num-batched-token` caps the amount of input processed at that stage. + +`--max-num-seq` restricts generation of both the first token and the remaining tokens by capping the maximum batch size at the given value. + +When an out-of-memory error occurs, the most direct solution is to reduce `gpu-memory-utilization`. Alternatively, lower `--max-num-batched-token` if the memory peak occurs while generating the first token, or lower `--max-num-seq` if it occurs while generating the remaining tokens. + If the service have been booted successfully, the console will display messages similar to the following: diff --git a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst index 77add609ac3..e0a2f770234 100644 --- a/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst +++ b/docs/readthedocs/source/doc/PythonAPI/LLM/langchain.rst @@ -31,7 +31,7 @@ IPEX-LLM provides ``TransformersLLM`` and ``TransformersPipelineLLM``, which imp Native Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, you could also use the following LLM wrappers with the native (cpp) implementation for maximum performance. +For ``llama``/``bloom``/``gptneox``/``starcoder`` model families, you could also use the following LLM wrappers with the native (cpp) implementation for maximum performance. .. tabs:: @@ -47,18 +47,6 @@ For ``llama``/``chatglm``/``bloom``/``gptneox``/``starcoder`` model families, yo .. automethod:: validate_environment .. automethod:: stream .. automethod:: get_num_tokens - .. tab:: ChatGLM - - .. autoclass:: ipex_llm.langchain.llms.ChatGLMLLM - :members: - :undoc-members: - :show-inheritance: - :exclude-members: ggml_model, ggml_module, client, model_path, kwargs - - .. automethod:: validate_environment - .. automethod:: stream - .. automethod:: get_num_tokens - .. tab:: Bloom ..
autoclass:: ipex_llm.langchain.llms.BloomLLM diff --git a/docs/readthedocs/source/index.rst b/docs/readthedocs/source/index.rst index 1071b5692ab..fb883121dac 100644 --- a/docs/readthedocs/source/index.rst +++ b/docs/readthedocs/source/index.rst @@ -363,6 +363,13 @@ Verified Models link + + Qwen2 + + link + + link + Qwen-VL @@ -618,6 +625,13 @@ Verified Models link + + MiniCPM + + link + + link + diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml index 447beab4b74..2ff2b3f2ccc 100644 --- a/python/llm/dev/benchmark/all-in-one/config.yaml +++ b/python/llm/dev/benchmark/all-in-one/config.yaml @@ -12,12 +12,11 @@ in_out_pairs: - '32-32' - '1024-128' test_api: - - "transformer_int4_gpu" # on Intel GPU, transformer-like API, (qtype=int4) - # - "transformer_int4_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4) - # - "transformer_int4_fp16_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp16) + - "transformer_int4_fp16_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp16) # - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), (dtype=fp16) + # - "transformer_int4_gpu" # on Intel GPU, transformer-like API, (qtype=int4), (dtype=fp32) + # - "transformer_int4_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), (dtype=fp32) # - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows, transformer-like API, (qtype=int4), use load_low_bit API. Please make sure you have used the save.py to save the converted low bit model - # - "ipex_fp16_gpu" # on Intel GPU, use native transformers API, (dtype=fp16) # - "bigdl_fp16_gpu" # on Intel GPU, use ipex-llm transformers API, (dtype=fp16), (qtype=fp16) # - "optimize_model_gpu" # on Intel GPU, can optimize any pytorch models include transformer model # - "deepspeed_optimize_model_gpu" # on Intel GPU, deepspeed autotp inference @@ -35,8 +34,9 @@ test_api: # - "deepspeed_transformer_int4_cpu" # on Intel CPU, deepspeed autotp inference # - "transformer_int4_fp16_lookahead_gpu" # on Intel GPU, transformer-like API, with lookahead, (qtype=int4), (dtype=fp16) cpu_embedding: False # whether put embedding to CPU -streaming: False # whether output in streaming way (only avaiable now for gpu win related test_api) -use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only avaiable now for "pipeline_parallel_gpu" test_api) -n_gpu: 2 # number of GPUs to use (only avaiable now for "pipeline_parallel_gpu" test_api) +streaming: False # whether output in streaming way (only available now for gpu win related test_api) +use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only available now for "pipeline_parallel_gpu" test_api) +n_gpu: 2 # number of GPUs to use (only available now for "pipeline_parallel_gpu" test_api) lookahead: 3 max_matching_ngram_size: 2 +task: 'continuation' # when test_api is "transformer_int4_fp16_lookahead_gpu", task could be 'QA', 'continuation' or 'summarize' diff --git a/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_401.txt b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_401.txt new file mode 100644 index 00000000000..00cc54d1676 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_401.txt @@ -0,0 +1,6 @@ +[INST] <> + +<> + +Q:Information: - Mark Andes (born February 19, 1948) is an American musician, known for his work as a bassist with Canned Heat, Spirit, Jo Jo Gunne, Firefall, Heart, and Mirabal. 
- Jo Jo Gunne is a rock band, formed in Los Angeles, California, in 1971 by Jay Ferguson (keyboards, vocals and guitar) and Mark Andes (bass guitar and vocals) after they had left Spirit. The group's name is derived from "Jo Jo Gunne", a Chuck Berry song that peaked at #83 as a single in November 1958. - A county seat is an administrative center, seat of government, or capital city of a county or civil parish. The term is used in the United States, Canada, Romania, China and Taiwan. In the United Kingdom and Ireland, county towns have a similar function. - The 2010 United States Census, (known as "Census 2010"), is the twenty-third and currently most recent United States national census. National Census Day, the reference day used for the census, was April 1, 2010. As part of a drive to increase the count's accuracy, 635,000 temporary enumerators were hired. The population of the United States was counted as 308,745,538, a 9.7% increase from the 2000 Census. - Rock Band is a series of music video games developed by Harmonix and MTV Games, and distributed by Electronic Arts for the Nintendo DS, iOS, PlayStation 2, PlayStation 3, PSP, Wii, Xbox One and Xbox 360 game systems. The series, inspired by Harmonix's previous efforts on the "Guitar Hero" series, allows up to four players to simulate the performance of popular rock music songs by playing with controllers modeled after musical instruments. Players can play the lead guitar, bass guitar, keyboard, and drums parts to songs, as well as sing into a USB microphone. Players are scored on their ability to match scrolling musical notes while playing instruments, and by their ability to match the singer's pitch on vocals. - Oklahoma (Cherokee: "Asgaya gigageyi" / ; or transliterated from English as ("òàlàhoma"), Pawnee: "Uukuhuúwa", Cayuga: "Gahnawiyogeh") is a state located in the South Central United States. Oklahoma is the 20th-most extensive and the 28th-most populous of the 50 United States. The state's name is derived from the Choctaw words "okla" and "humma", meaning "red people". It is also known informally by its nickname, "The Sooner State", in reference to the non-Native settlers who staked their claims on the choicest pieces of land before the official opening date, and the Indian Appropriations Act of 1889, which opened the door for white settlement in America's Indian Territory. The name was settled upon statehood, Oklahoma Territory and Indian Territory were merged and Indian was dropped from the name. On November 16, 1907, Oklahoma became the 46th state to enter the union. Its residents are known as "Oklahomans", or informally "Okies", and its capital and largest city is Oklahoma City. - Texas is the second largest state in the United States by both area and population. Geographically located in the south central part of the country, Texas shares borders with the U.S. states of Louisiana to the east, Arkansas to the northeast, Oklahoma to the north, New Mexico to the west, and the Mexican states of Chihuahua, Coahuila, Nuevo León, and Tamaulipas to the southwest, while the Gulf of Mexico is to the southeast. - Nuevo León, or New Leon, officially the Free and Sovereign State of Nuevo León, is one of the 31 states which, with Mexico City, compose the 32 Federal Entities of Mexico. It is divided into 51 municipalities and its capital city is Monterrey. - Dallas is a major city in the state of Texas and is the largest urban center of the fourth most populous metropolitan area in the United States. 
The city proper ranks ninth in the U.S. and third in Texas after Houston and San Antonio. The city's prominence arose from its historical importance as a center for the oil and cotton industries, and its position along numerous railroad lines. The bulk of the city is in Dallas County, of which it is the county seat; however, sections of the city are located in Collin, Denton, Kaufman, and Rockwall counties. According to the 2010 United States Census, the city had a population of 1,197,816. The United States Census Bureau's estimate for the city's population increased to 1,300,092 as of July 1, 2015. - San Antonio (Spanish for "Saint Anthony"), officially the City of San Antonio, is the seventh-most populated city in the United States and the second-most populous city in the state of Texas, with a population of 1,409,019. It was the fastest growing of the top 10 largest cities in the United States from 2000 to 2010, and the second from 1990 to 2000. The city straddles South Texas and Central Texas and is on the southwestern corner of an urban megaregion known as the Texas Triangle. - Jimmie Randall ( born Dallas , Texas , February 14 , 1949 ) is a bass guitarist best known for his work with Jo Jo Gunne . - The guitar is a musical instrument classified as a string instrument with anywhere from four to 18 strings, usually having six. The sound is projected either acoustically, using a hollow wooden or plastic and wood box (for an acoustic guitar), or through electrical amplifier and a speaker (for an electric guitar). It is typically played by strumming or plucking the strings with the fingers, thumb and/or fingernails of the right hand or with a pick while fretting (or pressing against the frets) the strings with the fingers of the left hand. The guitar is a type of chordophone, traditionally constructed from wood and strung with either gut, nylon or steel strings and distinguished from other chordophones by its construction and tuning. The modern guitar was preceded by the gittern, the vihuela, the four-course Renaissance guitar, and the five-course baroque guitar, all of which contributed to the development of the modern six-string instrument. - Coahuila, formally Coahuila de Zaragoza, officially the Free and Sovereign State of Coahuila de Zaragoza, is one of the 31 states which, along with Mexico City, compose the 32 Federal Entities of Mexico. The state is located in Northeastern Mexico on the US border. - Tamaulipas, officially the Free and Sovereign State of Tamaulipas, is one of the 31 states which, with Mexico City, comprise the 32 Federal Entities of Mexico. It is divided into 43 municipalities and its capital city is Ciudad Victoria. The capital city was named after Guadalupe Victoria, the first President of Mexico. - Charles Edward Anderson "Chuck" Berry (born October 18, 1926) is an American guitarist, singer and songwriter and is one of the pioneers of rock and roll music. With songs such as "Maybellene" (1955), "Roll Over Beethoven" (1956), "Rock and Roll Music" (1957) and "Johnny B. Goode" (1958), Berry refined and developed rhythm and blues into the major elements that made rock and roll distinctive, with lyrics focusing on teen life and consumerism and music featuring guitar solos and showmanship that were a major influence on subsequent rock music. - New Mexico is a state located in the southwestern region of the United States of America. It was admitted to the union as the 47th state on January 6, 1912. It is usually considered one of the Mountain States. 
New Mexico is fifth by area, the 36th. Given the information above, choose from the list below the object entity that exhibits the relation 'occupation' with the subject 'jimmie randall'. Choices: - band - canada - construction - count - guitarist - hero - major - musician - official - saint - singer - songwriter - sovereign - speaker - united states of america +A: [/INST] \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_497.txt b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_497.txt new file mode 100644 index 00000000000..25fc62cef28 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_497.txt @@ -0,0 +1,101 @@ +[INST] <> +You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps. +<> + +News article: + +(Carolyn Kaster/AP Photo) + + Weary of waiting for an economic recovery worth its name, a frustrated American public has sent Barack Obama's job approval rating to a career low - with a majority in the latest ABC News/Washington Post poll favoring a Republican Congress to act as a check on his policies. + + Registered voters by 53-39 percent in the national survey say they'd rather see the Republicans in control of Congress as a counterbalance to Obama's policies than a Democratic-led Congress to help support him. It was similar in fall 2010, when the Republicans took control of the House of Representatives and gained six Senate seats. + + See PDF with full results and charts here. + + Obama's job approval rating, after a slight winter rebound, has lost 5 points among all adults since March, to 41 percent, the lowest of his presidency by a single point. Fifty-two percent disapprove, with "strong" disapproval exceeding strong approval by 17 percentage points. He's lost ground in particular among some of his core support groups. + + Economic discontent remains the driving element in political views in this survey, produced for ABC by Langer Research Associates. Americans rate the condition of the economy negatively by 71-29 percent - the least bad since November 2007, but still dismal by any measure. Only 28 percent think the economy's improving, down by 9 points since just before Obama won his second term. He gets just 42 percent approval for handling it. + + Economic views are strongly related to political preferences. Among people who see the economy improving, 65 percent prefer Democratic control of Congress, while among those who see the economy as stagnant or worsening, 62 percent favor Republican control. Notably, economic views are linked with preferences for control of Congress regardless of people's partisan affiliation. + + The results suggest the corrosive effects of the long downturn on the president's popularity: Among those who say the economy is in bad shape, Obama's overall approval rating has lost 20 points since February 2012, from 46 percent then to 26 percent now. + + The president faces other challenges. While he's hailed insurance exchange sign-ups as a marker of the Affordable Care Act's success, the program and his rating for handling it have lost ground, both down from their levels late last month after the Healthcare.gov website was stabilized. The law gets 44 percent support, down 5 points; Obama has just 37 percent approval for its implementation, down 7. 
+ + One reason is that the law seems to have opened an avenue for public ire about health care costs to be directed at the administration. Six in 10 blame the ACA for increasing costs nationally, and 47 percent think it's caused their own health care expenses to rise. Regardless of whether or how much those costs would have risen otherwise, Obamacare is taking a heavy dose of the blame. + + Separately, a current issue on the world stage offers no respite for Obama: Given continued tensions over Ukraine, just 34 percent of Americans approve of how he's handling that situation, 8 points fewer than early last month. Forty-six percent disapprove, with two in 10 withholding judgment. + + DISCONTENT/MIDTERMS - With these and other problems - but chiefly the economy - the public by more than 2-1, 66-30 percent, says the country's headed seriously off on the wrong track. That's about where it's been lately, and more negative than a year ago. + + General anti-incumbency results: Just 22 percent of Americans say they're inclined to re-elect their representative in Congress, unchanged from last month as the fewest in ABC/Post polls dating back 25 years. + + Another outcome is risk for the president's party, in punishment for his handling of the helm. A single point divides Democratic and Republican candidates for the House in preference among registered voters, 45-44 percent. Among those who say they're certain to vote (with Republicans more apt to show up in midterms), that goes to 44-49 percent. + + Independents, a sometimes swing-voting group, favor Republican House candidates by 55-32 percent (among those who say they're certain to vote). And, as with views on control of Congress, perceptions of the economy correlate with congressional vote preference, regardless of partisanship. + + ISSUES - None of this means the GOP is home free. A robust improvement in the economy could change the equation. (As many, at least, say it's currently holding steady, 35 percent, as think it's getting worse, 36 percent.) And even as the brunt of economic unhappiness falls on the president, the public divides essentially evenly on which party they trust more to handle the economy - suggesting that the Republicans have yet to present a broadly appealing alternative. + + In another example, for all of Obamacare's controversies, the Democrats hold a slight 8-point edge in trust to handle health care, again indicating that the Republicans have yet to seize the opportunity to present a compelling solution of their own. Indeed, the Democrats have a 6-point lead in trust to handle "the main problems the nation faces" - although, as with all others, that narrows among likely voters, in this case to 37-40 percent, a numerical (but not significant) GOP edge. + + The Republicans have a 9-point advantage in trust to handle the federal deficit - an improvement for the party from last month. Similarly, Americans by a 7-point margin trust the Republicans over Obama to find the right mix of spending to cut and federal programs to maintain. The president had an 11-point lead on that question just after the partial government shutdown last fall. + + The Democrats push back with two results that they're likely to stress as the November election draws closer: One is a broad, 20-point advantage, 52-32 percent, in trust over the Republicans to help the middle class (but again, this narrows among likely voters). The other is an even wider, 30-point lead, 55-25 percent, in trust to handle issues of particular concern to women. 
+ + The Republicans have some vulnerability in other areas, as well. Americans say the Democratic Party comes closer than the GOP to their positions on climate change, by 18 points; whether or not to raise the minimum wage, by 16 points; gay marriage, by 14 points; and the issue of abortion, by 8 points. On one remaining issue, gun control, the Republicans have a slight, 5-point edge. + + HEALTH CARE - Obamacare, for its part, is a subject the Republicans have sought to turn to their advantage in the midterm elections, and the poll results show ample opportunity. + + Costs are a particular target. As noted, 47 percent of Americans feel that their health care costs are rising as a result of the ACA; 58 percent say the same about the overall costs of health care nationally. Just 8 and 11 percent, respectively, say the law has decreased these costs. If there's a case to be made that costs would have risen anyway - or that they would have risen faster absent the ACA - it's yet to resonate with large segments of the population. + + Other assessments also are critical. The public by a 20-point margin, 44-24 percent, is more apt to say the law has made the overall health care system worse rather than better (although the number who say it's made things better is up by 5 points from December). The rest, 29 percent, see no change. Americans by 29-14 percent likewise say the ACA has made their own care worse rather than better, with more, 53 percent, reporting no impact. + + Despite the website's improvements, half say the law's implementation is going worse than they expected when it began, vs. 41 percent better - another sign of the persistent antipathy that's dogged Obamacare from the start. + + The poll also shows both the striking partisan division on Obamacare and the extent to which, on several questions, independents side more with Republicans on the issue. Thirty-eight percent of Democrats, for instance, say the ACA has increased health care costs nationally; that soars to 67 percent of independents and 73 percent of Republicans. And while 47 percent of Democrats think it's made the health care system better, just 6 and 16 percent of Republicans and independents, respectively, agree. + + OBAMA/GROUPS - Divisions among groups remain especially stark in terms of Obama's ratings; further, as noted, he's lost ground in some of his core support groups. The president's approval rating since early March has lost 14 points among liberals, 12 points among people with postgraduate degrees, 10 points among urban residents, 9 points among Democrats and 7 points among those with incomes less than $50,000. He's lost 9 points among independents as well. + + With 41 percent approval overall (his previous low was 42 percent last November and the same in October 2011), Obama's at new lows among nonwhites (61-34 percent, approve-disapprove) and liberals (63-31 percent), and matches his lows among moderates (46-48 percent) and independents (33-59 percent). His rating among Democrats, 74-22 percent, is a single point from its low. + + Other results also mark the extent of the difficulties facing Obama and his party alike. A form of statistical analysis called regression finds that, as noted above, views on the economy correlate both with congressional vote preference, and views on which party should control Congress, independently of partisan affiliation. That suggests that the Democrats are in serious need of a positive shift in economic views. + + That may be hard to accomplish. 
While 50 percent of Democrats say the economy's in good shape, that plummets not only among Republicans but independents as well, to 12 and 22 percent, respectively. And while 46 percent of Democrats see improvement in the economy, again just 22 percent of independents, and 15 percent of Republicans, agree. + + Preferences on which party controls Congress may reflect a general inclination in favor of divided government - and don't always predict outcomes, as in 2002, when more registered voters preferred Democratic control yet the GOP held its ground. It's striking, nonetheless, that this poll finds Republican control favored not only in the 2012 red states, by 56-36 percent, but also by 51-41 percent in the blue states that backed Obama fewer than two years ago. + + METHODOLOGY - This ABC News/Washington Post poll was conducted by telephone April 24-27, 2014, in English and Spanish, among a random national sample of 1,000 adults, including landline and cell-phone-only respondents. Results have a margin of sampling error of 3.5 points, including design effect. Partisan divisions are 32-21-38 percent, Democrats-Republicans-independents. + + The survey was produced for ABC News by Langer Research Associates of New York, N.Y., with sampling, data collection and tabulation by Abt-SRBI of New York, N.Y. ||||| President Obama’s approval rating fell to 41 percent, down from 46 percent through the first three months of the year and the lowest of his presidency in Washington Post-ABC News polls. (Charles Dharapak/AP) + + Democrats face serious obstacles as they look to the November elections, with President Obama’s approval rating at a new low and a majority of voters saying they prefer a Congress in Republican hands to check the president’s agenda, according to a new Washington Post-ABC News poll. + + Obama’s approval rating fell to 41 percent, down from 46 percent through the first three months of the year and the lowest of his presidency in Post-ABC News polls. Just 42 percent approve of his handling of the economy, 37 percent approve of how he is handling the implementation of the Affordable Care Act and 34 percent approve of his handling of the situation involving Ukraine and Russia. + + Obama’s low rating could be a significant drag on Democratic candidates this fall — past elections suggest that when approval ratings are as low as Obama’s, the president’s party is almost certain to suffer at the ballot box in November. + + Republicans are favored to maintain control of the House, with the focus now on whether they can take control of the Senate. One key question about November is who will vote. Turnout in midterm elections is always lower than in presidential elections, and at this point, key elements of the Republican coalition — namely white voters and older voters — say they are more certain to cast ballots this fall than are younger voters and minorities, two groups that Democrats and Obama relied on in 2008 and 2012. + + Democrats are not without assets as the midterm election campaigns intensify. Americans trust Democrats over Republicans by 40 to 34 percent to handle the country’s main problems. By significant margins, Americans see Democrats as better for the middle class and on women’s issues. Americans favor the Democrats’ positions on raising the minimum wage, same-sex marriage and on the broad issue of dealing with global climate change. 
+ + View Graphic Obama receives low marks as Democrats face midterm turnout challenge + + Led by Obama, Democrats have sought to use many of these issues to draw contrasts with Republicans, both nationally and in states with the most competitive races. As yet, however, there is little evidence that those assets outweigh either the normal midterm disadvantages of the party that holds the White House or the dissatisfaction with the general direction of the country and Obama’s leadership generally. + + The Affordable Care Act is expected to be a major issue in the midterm elections. Obama recently urged Democrats to defend the law energetically, particularly after the administration announced that 8 million people signed up for it during the initial enrollment period. Republicans are confident that opposition to the new law will energize their supporters. + + The Post-ABC poll found that 44 percent say they support the law while 48 percent say they oppose it, which is about where it was at the end of last year and in January. Half of all Americans also say they think implementation is worse than expected. + + Last month, a Post-ABC poll found 49 percent of Americans saying they supported the new law compared with 48 percent who opposed it. That finding was more positive for the administration than most other polls at the time. Democrats saw it as a possible leading indicator of a shift in public opinion, but that has not materialized. + + A 58 percent majority say the new law is causing higher costs overall, and 47 percent say it will make the health-care system worse. While a majority say the quality of the health care they receive will remain the same, a plurality expect it to result in higher personal costs for that care. + + A number of Democratic strategists are urging their candidates to campaign on a message that calls for continued implementation of the law, with some fixes. These strategists say that message is more popular than the “repeal and replace” theme of the Republicans. A separate poll Tuesday from the Kaiser Family Foundation finds nearly six in 10 want Congress to improve the law rather than repeal it and replace it with something new. + + Democrats are hoping to put Republicans on the defensive on the question of “what next” for the Affordable Care Act. Republicans say they remain confident that the health-care issue will help them more in November. + + Pessimism about the economy also persists, with more than seven in 10 describing the economy in negative terms. Public attitudes about the future of the economy are anything but rosy. Just 28 percent say they think the economy is getting better, while 36 percent say it is getting worse and 35 percent say it’s staying the same. + + Americans express continued discontent about the country’s direction, with two-thirds saying things are on the wrong track. Asked whether each party’s incumbents deserve relection, at least six in 10 say they do not. + + Among registered voters, 45 percent intend to vote for the Democratic candidate in House elections this fall, and 44 percent for the Republican candidate. Based on past elections, that close margin is troubling news for Democrats.. What is a shorter version of the above article? 
[/INST] \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_776.txt b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_776.txt new file mode 100644 index 00000000000..d70fea4b40c --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_776.txt @@ -0,0 +1,6 @@ +[INST] <> +You are an AI assistant that follows instruction extremely well. Help as much as you can. +<> + +This article: At Birmingham, Oliphant's team had reached a different conclusion. Oliphant had delegated the task to two German refugee scientists, Rudolf Peierls and Otto Frisch, who could not work on Oliphant's radar project because they were enemy aliens and therefore lacked the necessary security clearance. Francis Perrin had calculated the critical mass of uranium to be about 40 tonnes (39 long tons; 44 short tons). He reckoned that if a neutron reflector were placed around it, this might be reduced to 12 tonnes (12 long tons; 13 short tons). Peierls attempted to simplify the problem by using the fast neutrons produced by fission, thus omitting consideration of moderator. He too calculated the critical mass of a sphere of uranium in a theoretical paper written in 1939 to be "of the order of tons".Peierls knew the importance of the size of the critical mass that would allow a chain reaction to take place and its practical significance. In the interior of a critical mass sphere, neutrons are spontaneously produced by the fissionable material. A very small portion of these neutrons are colliding with other nuclei, while a larger portion of the neutrons are escaping through the surface of the sphere. Peierls calculated the equilibrium of the system, where the number of neutrons being produced equalled the number escaping.Niels Bohr had theorised that the rare uranium-235 isotope, which makes up only about 0.7% of natural uranium, was primarily responsible for fission with fast neutrons, although this was not yet universally accepted. Frisch and Peierls were thus able to revise their initial estimate of critical mass needed for nuclear fission in uranium to be substantially less than previously assumed. They estimated a metallic sphere of uranium-235 with a radius of 2.1 centimetres (0.83 in) could suff. What were the full names of the two people who calculated the critical mass of uranium?, what is it ? +Answer: [/INST] \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_99.txt b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_99.txt new file mode 100644 index 00000000000..8547a558d75 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/QA/orca_99.txt @@ -0,0 +1,5 @@ +[INST] <> +You are an AI assistant. You will be given a task. You must generate a detailed and long answer. +<> + +You could go directly into the confessional (provided there's no one else in there or waiting outside), but sometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. 
Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, "Bless me, Father, for I have sinned. It has been (blank) since my last confession." This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, "I did so and so," will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy (If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. I detest all of my sins because of thy just punishment. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. You may even walk away just having to say a few meaningful prayers. Take the absolution to heart --. What is a one-sentence summary of the following article? 
[/INST] \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/1024.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/1024.txt similarity index 100% rename from python/llm/dev/benchmark/all-in-one/prompt/1024.txt rename to python/llm/dev/benchmark/all-in-one/prompt/continuation/1024.txt diff --git a/python/llm/dev/benchmark/all-in-one/prompt/2048.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/2048.txt similarity index 100% rename from python/llm/dev/benchmark/all-in-one/prompt/2048.txt rename to python/llm/dev/benchmark/all-in-one/prompt/continuation/2048.txt diff --git a/python/llm/dev/benchmark/all-in-one/prompt/256.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/256.txt similarity index 100% rename from python/llm/dev/benchmark/all-in-one/prompt/256.txt rename to python/llm/dev/benchmark/all-in-one/prompt/continuation/256.txt diff --git a/python/llm/dev/benchmark/all-in-one/prompt/32.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/32.txt similarity index 100% rename from python/llm/dev/benchmark/all-in-one/prompt/32.txt rename to python/llm/dev/benchmark/all-in-one/prompt/continuation/32.txt diff --git a/python/llm/dev/benchmark/all-in-one/prompt/8192.txt b/python/llm/dev/benchmark/all-in-one/prompt/continuation/8192.txt similarity index 100% rename from python/llm/dev/benchmark/all-in-one/prompt/8192.txt rename to python/llm/dev/benchmark/all-in-one/prompt/continuation/8192.txt diff --git a/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_239.txt b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_239.txt new file mode 100644 index 00000000000..300c54f4518 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_239.txt @@ -0,0 +1 @@ +(CNN)Jason Rezaian has sat in jail in Iran for nearly nine months. The Washington Post's bureau chief in Tehran was arrested in July on unspecified allegations. It took more than four months for a judge to hear charges against him. They remained publicly undisclosed until last week. The Iranian-American will be tried soon on espionage, Tehran's chief justice said. He is accused of economic spying, the Post reported, citing Iranian state media. The Washington Post did not mince words on the allegation. "Any charges of that sort would be absurd, the product of fertile and twisted imaginations," the paper said in a statement. The State Department also reacted with term "absurd" after hearing of reports in Iran's press about the charges. "If the reports are true, these charges are absurd, should be immediately dismissed and Jason should be immediately freed so that he can return to his family," the State Department official said. Since officers picked up Rezaian and his wife, Yeganeh Salehi, on July 22 at their home, the Post, the State Department and Rezaian's family have protested and called for his release. Salehi was released on bail in October. Rezaian was denied bail. And for months, he was denied access to proper legal representation, his family has said. Boxing great Muhammad Ali, also an American Muslim, appealed to Tehran last month to give Rezaian full access to legal representation and free him on bail. "To my knowledge, Jason is a man of peace and great faith, a man whose dedication and respect for the Iranian people is evident in his work," Ali said in a religiously worded statement. The journalist has also not been allowed to see visitors aside from his wife and has endured long interrogations, family members have said. 
In December, after a 10-hour hearing, Rezaian signed a paper to acknowledge that he understood the charges against him, the Post reported. Iran's human rights chief, Mohammad Javad Larijani, told news outlet France 24 last year that he hoped Rezaian's case would come to a positive conclusion. He said, "Let us hope that this fiasco will end on good terms." More on detained Americans . CNN's Sara Mazloumsaki and Azadeh Ansari contributed to this report. \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_5618.txt b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_5618.txt new file mode 100644 index 00000000000..359a4193a97 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_5618.txt @@ -0,0 +1 @@ +Concerns are raised about Labour's policy under shadow education secretary Tristram Hunt . The heads of some of Britain’s best state schools today warn of the dangers of a Labour government reversing radical education reforms. In a letter to the Daily Mail, 80 current and former leaders say there is clear evidence that academy-style freedoms are benefiting a generation of children. But they say Labour – and some senior Lib Dems – appear to be threatening to reimpose state controls. The letter, signed by the heads of good and outstanding autonomous schools, was backed yesterday by David Cameron. In it, they claim there is evidence that the most successful education systems benefit from schools with academy-style freedoms. They say such schools are more likely to be ranked ‘outstanding’ by Ofsted and more likely to improve. ‘Secondary schools which have converted to academy status outperform other schools – by a margin of almost 10 per cent,’ they wrote. But the heads expressed alarm at comments by Ed Miliband that Labour would reimpose ‘a proper local authority framework for all schools’. Senior Lib Dems were also accused of suggesting they no longer support freedom for acdemies, which are able to control pay, conditions and the curriculum. ‘This is not the time to stop something that is working to the benefit of so many children in schools,’ wrote the heads. Schools on the letter include Torquay Boys’ Grammar School, ranked in the top 100 for GCSE results this year. United Westminster Schools in London is also on the list, and includes Grey Coat Hospital – where Mr Cameron’s daughter Nancy starts this year. Tom Clark, chairman of Freedom and Autonomy for Schools National Association, which organised the letter, added: ‘Our only concern is that the autonomy which has worked well for pupils stays in place.’ Mr Cameron said yesterday: ‘Ed Miliband would put all this at risk.’ The letter, signed by the heads of good and outstanding autonomous schools, was backed by David Cameron . As the General Election campaign turned to education yesterday, the Prime Minister also attacked Labour yesterday for opposing the expansion of free schools – which are run by groups such as teachers, parents and charities and are outside of local authority control. He said the opposition’s antipathy appeared to be based on a concern that ‘if we set up a good new school, everyone will want to go there’. ‘Yes – that’s the whole point,’ he told the Mail. ‘How can you possibly be against an excellent school setting up another excellent school?’ He accused Labour of being ‘anti free schools’ for suggesting it wants to scrap the scheme. 
‘It’s that mindset that says choice, freedom, responsibility, aspiration – that these are things to worry about rather than celebrate,’ he added. The education reforms, masterminded by Michael Gove, have been hailed by Mr Cameron as the most important ‘for a generation’. Ed Miliband has said Labour would ‘have a proper local authority framework for all schools’ We write as current and former headteachers and school leaders of good and outstanding autonomous schools across the country committed to the very best in state education. FASNA — the Freedom and Autonomy for Schools National Association — has helped build a consensus over 25 years which recognises that diversity and self-determination help shape outstanding education. We are firmly committed to the maintenance of current academy freedoms. International evidence shows that the most successful education systems benefit from schools with academy-style freedoms. The freedoms which have come with academy status have helped FASNA schools to improve education for children in our own schools and also enabled us to work better together to raise standards in other schools. The evidence shows that primary schools which have converted to academy status are doing better than other schools — they are more likely to be ranked ‘outstanding’ by Ofsted and are more likely to improve from ‘good’ to ‘outstanding’. Secondary schools which have converted to academy status out-perform other schools by a margin of almost 10 per cent. But as school leaders we are concerned that recent statements from Liberal Democrat and Labour politicians suggest they might not protect all the freedoms which schools and teachers now enjoy and which are helping to drive up standards across the board. Though Shadow Education Secretary Tristram Hunt said that Labour would not ‘go back to the old days of the local authority running all the schools’, Ed Miliband has said Labour would ‘have a proper local authority framework for all schools’. And a Liberal Democrat education spokesman told a recent FASNA conference that he could not support the freedom for schools to vary pay and conditions or to vary the curriculum, and he felt that schools needed local control. Any erosion of school freedoms through local authority or government regulation or overbearing ‘middle-tier’ structures will reduce the capacity of schools to perform well in the future. We call on all political leaders to guarantee that all current academy freedoms, including those relating to pay and conditions and the curriculum, will be maintained after the General Election. This is not the time to stop something that is working to the benefit of so many children in schools. The letter was signed by: . Tom Clark CBE, chair of FASNA, former principal George Spencer Academy, Nottingham . Martin Murphy, headteacher, Arden Academy, Knowle . Richard Vasey, headteacher, Ashfield School, Kirkby in Ashfield . Karen Land, chief finance officer, Aspire Academies Trust, Bovingdon . Steven Chamberlain, headteacher, Barnby Road Academy primary and nursery school, Newark . Corrina Beckett, school business manager, Barnby Road Academy primary and nursery school, Newark . Simon Ascroft, headteacher, Biddulph High School, Stoke on Trent . Steve Phillips, principal, Biggleswade Academy Trust, Biggleswade . Andrew Cliffe, headteacher, Brine Leas School, Nantwich . Nick Law, headteacher, Carre’s Grammar School, Sleaford . Duncan Gauld, headteacher, Christ Church Chorleywood C of E School, Chorleywood . 
Caroline Anderson, operations manager, Christ the King School, Nottingham . Tony Lamberton, headteacher, Christleton High School, Christleton . Dame Kate Dethridge, headteacher, Churchend Primary Academy, Reading . Tony Parker, director of school improvement, City Learning Trust, Stoke on Trent . Terry Molloy, headteacher, Claremont High School Academy, Harrow . Paul Evans, headteacher, Colyton Grammar School, Colyton . David Hermitt, chief executive officer, Congleton High School, Congleton . Seb Sales, headteacher, Connaught Junior School, Bagshot . Tony Hull, CEO, Evolution Academy Trust Costessey Junior School/Evolution Academy Trust, Norwich . Kieran Earley, headteacher, Devonport High School, Plymouth . Colin House, headteacher, Dove House School Academy Trust, Basingstoke . Sonia Case, headteacher, Dulwich Hamlet Junior School, Dulwich Village . Androulla Peek, executive headteacher, Fleetville Trust, St Albans . John Mirfin, vice chair of governors, Foxwood Academy, Nottingham . Chris Humphreys, headteacher, Foxwood Academy, Nottingham . Fraser Mitchell, principal, George Spencer Academy, Nottingham . Susan Jowett, executive principal, George Spencer Academy, Nottingham . Catharine Darnton, headteacher, Gillotts School, Henley on Thames . Pamela Birch, headteacher, Hambleton Primary Academy, Poulton Le Fylde . Drew Povey, headteacher, Harrop Fold School, Salford . Emma Yates, headteacher, Hayesfield Girls’ School, Bath . Carl Ward, executive principal/chief executive Haywood Academy/City Learning Trust, Stoke on Trent . Mark Knapton, principal, Healing School – A Science Academy, Grimsby . Michael Cook, headteacher, Heckmondwike Grammar School Academy, Heckmondwike . Steve Riches, headteacher, Highams Park School, London . Peter Nutkins, headteacher, Humphrey Perkins School, Barrow Upon Soar . Cathy Longhurst, headteacher, Mandeville Primary School, St Albans . Clare Askwith, acting headteacher, Monkton Infants School, South Shields . Stephen Morales, executive director, National Association of School Business Management . Mark Perry, headteacher, New Waltham Academy, Grimsby, . Peter Beaven, etired headteacher, executive board member of FASNA, Norton Hill School and Somervale School, Radstock . Steve Dunning, headteacher, Olney Infant Academy, Olney . Martin Shevill, ex-headteacher, Ossett Academy & Sixth Form, Ossett . Nick Daymond, headmaster, Parmiter’s School, Watford . David Wilson, chair of governors, Pax Christi Catholic Academy Trust, Nottingham . Kim Barrett, deputy head, Pelham Primary School, Bexleyheath . Sue Darbyshire, headteacher, Platt Bridge Community School, Wigan . Joan Binder, chair of governors and vice chair of FASNA Plume School, Maldon . David Stephenson, headteacher, Plume School, Maldon . Neville Coles, principal, Priory Community School, Weston-Super-Mare . Nick Edwards, business manager, Queen Elizabeth’s Grammar School, Blackburn . Neil Enright, headteacher Queen Elizabeth’s School, Barnet . Kathy Winrow, retired head, Ranelagh CE School, Bracknell . Keith Douglas, headteacher, Rickmansworth School, Rickmansworth . John Leigh, headteacher, Sandbach High School and Sixth Form College, Sandbach . Andrew Fielder, executive principal, Sandy Hill Academy, St Austell . Chris Crook, headteacher, Smallthorne Primary School, Stoke on Trent . Andrew Johnson, executive headteacher, Springwood High School, King’s Lynn . Simon Duggan, headmaster, St Anselm’s College, Wirral . Nigel Fisher, headteacher St Columba’s Catholic Boys’ School, Bexleyheath . 
Joan McCarthy, headteacher, St John Houghton Catholic Voluntary Academy, Ilkeston . Dame Sue Bourne, headteacher, The Avenue School Special Needs Academy, Reading . Gary Pratt, headteacher The Chafford School, Rainham . Iain Erskine, headteacher, The Fulbridge Academy, Peterborough . Sharon Bruton, chief executive officer, The Keys Federation Academy Trust, Hindley Green, Wigan . Martin Latham, retired headteacher, The Robinswood Academy Trust, Matson . David Hampson, chief executive Tollbar Academy, Grimsby . Jane Aukett, vice-chair of governors, Tollbar Academy, Grimsby . Rosemary Joyce, headteacher, Tonbridge Grammar School, Tonbridge . Peter Lawrence, headteacher, Torquay Boys’ Grammar School, Torquay . Roy Blackwell, clerk to foundation and governors, United Westminster Schools, London - Grey Coat Hospital Foundation . Denham Kite, headteacher, Victoria Dock Primary School, Kingston Upon Hull . Dianne Marshall, executive headteacher, Violet Way Academy, Burton-upon-Trent . Jayne Harrison, school business manager, Violet Way Academy, Burton-upon-Trent . Arthur Goldstraw, chair of governors, Violet Way Academy, Burton-upon-Trent . Lynne Fox, executive principal, Wade Deacon High School, Widnes . Pam Wright, chief executive officer, Wade Deacon Innovation Enterprise Academy, Widnes . Dame Helen Hyde, headteacher, Watford Grammar School for Girls, . Watford Stuart Beeley, headteacher, Wellington School, Altrincham . John Rowan, headteacher, Whirley Primary School, Macclesfield . Tanya Watson, headteacher William Tyndale Primary School, London . Linda Davis, principal, Wistaston Academy, Crewe . Peter Taylor, headteacher, Worth Primary School, Poynton . \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_615.txt b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_615.txt new file mode 100644 index 00000000000..798ef1e6add --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_615.txt @@ -0,0 +1 @@ +(CNN)You might call her a watchdog, because this Boston area Doberman really has a thing for timepieces. She recently ate three. Last Thursday, Jeff Courcelle came home from work to find 5-year-old Mocha -- a pure bred fawn-colored Doberman pincher -- hovering over a pile of screws, metal pieces, three watch heads and some chewed leather straps. "My husband, who's the most calm person that I know, called me up and said, 'I'm not quite sure if I should panic,' " said Courcelle's wife, Michele Parkinson. The 80-pound Doberman, whom her owners describe as "more goofy than scary," had pulled down a basket of wrist wear from a shelf in their bedroom and eaten nearly all the contents. Parkinson knew that Mocha wouldn't be able to pass all that leather on her own. The couple took her to the MSPCA's Angell Animal Medical Center, a 24-hour emergency and specialty hospital, where a veterinarian performed a 3-hour endoscopy to explore the contents of her belly. Mocha was a repeat offender: Just last summer, she got very sick and had to have emergency stomach surgery after a piece of plastic from an orange juice container perforated her intestine. She had 28 staples down her belly and 10 inches of intestine removed, Parkinson said. Fortunately this time, the jewelry remains were still in Mocha's belly and had not made their way into the digestive tract. The X-ray, however, was disturbing. Parkinson and her husband were just expecting to see a couple metal pieces. 
"It just looked like a Christmas tree and I almost threw up," Parkinson said. The veterinarian removed "about a pound of leather straps and metal pieces and detritus" during the endoscopy, and let nature take its course for the remaining pieces, MSPCA spokesman Rob Halpin said. As of Friday, Mocha was no worse for wear. The hospital sees dozens of cases each week of dogs ingesting foreign objects, and is trained to look for the symptoms of blockages -- typically lethargy, not eating and vomiting, Halpin said. They once saw a golden retriever who had stopped eating and found 43 pacifiers in her belly. (Apparently she was taking them from babies at the park.) And there was the 100-pound bull mastiff who ate his owner's brie that was set out for a party -- along with the cheese knife. The night Mocha stayed in the hospital, a nervous Parkinson stayed awake reading stories about dogs ingesting watches and other objects. She found one article about a Newfoundland whose owner knew something was awry only when he heard an alarm go off from his dog's belly. "We've taken every imaginable thing that could fit down the gullet of a dog out with surgery," Halpin said. "There's some evolutionary traits that some dogs have that lead them to eat first and think later ... and some of them are so food motivated that anything with a scent could be associated with food, and they go for it." Mocha likes to suck on fleece blankets and has been known to eat rubber ear buds or hair elastics, but nothing like a pile of jewelry, Parkinson said. Her breeder wondered if the dog was acting out of anxiety. That day, Parkinson had left Mocha in a different apartment the couple owns that the dog wasn't as used to. The breeder told Parkinson that Dobermans are particularly known to get anxious and do these sort of things. "She had a dog that actually consumed her whole dog bed," Parkinson said. From now on, Parkinson said she will put Mocha in a crate if the dog will be staying somewhere new. Follow-up X-rays Monday on Mocha showed a few pieces of metal left, "but they were moving along" and the vet expected her to pass them naturally. Parkinson said Friday Mocha was "her playful, energetic, curious Doberman self." But now that she thinks of it, the timing of this whole incident is a little suspicious. "My husband was all excited about the new Apple watch, but couldn't justify a reason to purchase it since he owned three watches," she said. "I am convinced that he and Mocha joined forces here to destroy all of his current watches in order to make room for Apple's new watch." \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_64.txt b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_64.txt new file mode 100644 index 00000000000..dba0f1e2339 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_64.txt @@ -0,0 +1 @@ +(CNN)Authorities identified and charged a man Monday in connection with the discovery of human remains in a duffel bag in Cambridge, Massachusetts, over the weekend. Carlos Colina, 32, was arraigned on charges of assault and battery causing serious bodily injury and improper disposal of a body, the Middlesex District Attorney's Office said in a statement. "This was a gruesome discovery," said District Attorney Marian Ryan. "Detectives are continuing to analyze evidence and awaiting information from the Office of the Chief Medical Examiner so that we may determine if additional charges are warranted." 
Police were notified Saturday morning about a suspicious item along a walkway in Cambridge. Officers arrived at the scene, opened a duffel bag and found human remains. After that discovery, police say, a surveillance video led them to an apartment building, where more body parts were discovered in a common area. That location is near the Cambridge Police Department headquarters. The remains at both locations belonged to the same victim, identified Monday as Jonathan Camilien, 26. Camilien and Colina knew each other, according to authorities. The next scheduled hearing in the case is set for April 14. CNN's Andreas Preuss contributed to this report. \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_824.txt b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_824.txt new file mode 100644 index 00000000000..8d8d2dfe5b4 --- /dev/null +++ b/python/llm/dev/benchmark/all-in-one/prompt/summarize/cnn_824.txt @@ -0,0 +1 @@ +Boston (CNN)After weeks of dramatic testimony, jurors are set to begin deliberations Tuesday in the trial of Dzhokhar Tsarnaev, who faces life in prison or the death penalty for working with his brother to explode bombs at the 2013 Boston Marathon. The defense and prosecution made closing arguments in the case on Monday. "The defendant brought terrorism into the backyards and main streets," Assistant U.S. Attorney Aloke Chakravarty said. "The defendant thought that his values were more important than the people around him. He wanted to awake the mujahideen, the holy warriors, so he chose Patriots Day, Marathon Monday," a time for families to gather and watch the marathon. Bomb survivors and victims' family members wiped away tears and comforted each other in court. Tsarnaev fidgeted at the defense table as he has done throughout the trial. Bill Richard, father of 8-year-old bomb victim Martin Richard, craned his neck to watch Tsarnaev as the prosecutor spoke. Dzhokhar Tsarnaev "chose a day when the eyes of the world would be on Boston," Chakravarty said. "He chose a day when there would be civilians on the sidewalks and he targeted those civilians: men, women and children." The lawyer waited a beat. "He wanted to terrorize this country. He wanted to punish America for what it was doing to his people." The prosecutor showed a picture of Dzhokhar Tsarnaev and his brother Tamerlan in the marathon crowd. The day of the bombings, Chakravarty said, "they felt they were soldiers. They were the mujahideen and they were bringing their battle to Boston." Tsarnaev, 21 years old, stands accused of 30 counts, including setting off weapons of mass destruction at a public event as an act of terrorism. Seventeen of those counts carry a sentence of death or life imprisonment. If Tsarnaev is found guilty of at least one of the 17 capital counts, the trial will proceed to a second phase, the so-called penalty phase. That part of the trial will include evidence of aggravating and mitigating factors, and the jury will be asked to weigh elements that make this crime especially heinous against details from Tsarnaev's background and mental health history that would weigh in his favor. Since testimony began on March 4, federal prosecutors have called 92 witnesses, and the defense just four. It seemed a mismatch from the start. "He was there," defense attorney Judy Clarke conceded as the trial opened, but the defense strategy always had been to focus on persuading the jury to spare Tsarnaev's life. 
The prosecution on Monday played a graphic video of the scene of the bombing that showed a chaotic, bloody scene with injured people everywhere. A child's piercing cries are heard. It's the son of Rebekah Gregory, who lost her leg. Then, another photo is displayed. This time jurors see Tsarnaev standing by a tree behind the family of little Martin Richard. "These children weren't innocent to him," the prosecutor said. "They were American. He knew what that bag was designed to do." Chakravarty quoted Bill Richard, Martin's father, who earlier testified, "I guess we were just unlucky that day." But luck had nothing to do with the Boston bombings, the prosecutor said. "This was a cold, intentional, terrorist act," he said. The brothers' acts that day were intended, he said, "to make a point. To tell America, 'We won't be terrorized by you anymore. We will terrorize you.' " The defense has maintained that Tsarnaev, who was 19 and flunking out of college at the University of Massachusetts, Dartmouth, fell under the sway of his older, more radicalized brother. "It was Tamerlan," defense attorney Clarke repeated during her closing argument Monday. "In the past few weeks we have come face to face with tragedy, suffering and grief in dimensions none of us could imagine," she said. "We've heard words, we've heard screams and we've heard cries. For this suffering and pain there is no excuse." She acknowledged her client participated in a "senseless act." But he was only following his brother, she insisted. "If not for Tamerlan, it would not have happened," Clarke argued. The older brother, a 26-year-old former Golden Gloves boxer, had hoped to wage jihad and his slacker younger brother was just along for the ride, the defense has said. During the 15-minute rebuttal period, prosecutor William Weinreb told jurors not to be distracted by the defense's "attempt to point the finger at somebody else." "There should be no doubt in your mind that the defendant and his brother are equally guilty," he said. They were "partners in crime." Weinreb pointed out that after the bombing Tsarnaev went to the grocery store. "Tamerlan Tsarnaev didn't turn his brother into a murderer. To shred the bodies of women and children with a homemade type of bomb, you have to be different from other people," the prosecutor said. If you are capable of such hate, such callousness that you can murder and maim 20 people and then drive to Whole Foods and buy some milk, can you really blame it on your brother?" From the start, prosecutors presented a compelling case in which the horrors of April 15 to 19, 2013, were vividly brought to life once again. They began with the stories of bombing survivors and first responders, who described acts of courage and compassion amid madness and chaos. The final moments of the three Boston Marathon spectators who died were recounted by the people who were by their sides. According to testimony, Tamerlan Tsarnaev set off a bomb made from a 6-quart pressure cooker, explosive powder from fireworks, duct tape, nails and BBs on Boylston Street near the finish line. That bomb, which exploded near Marathon Sports, claimed the life of Krystle Campbell, a 29-yeaer-old restaurant manager. Twelve seconds later, Dzhokhar Tsarnaev allegedly detonated a second, similar bomb outside the Forum restaurant, slightly more than a block away. That blast killed the boy, Martin Richard, and Lingzi Lu, 23, a graduate student from China. Chakravarty's voice grew soft Monday as he recalled the victims: . 
Richard's 69-pound body "was shattered, broken, eviscerated, burned. There wasn't a part of this boy's body that wasn't destroyed." Lu "received blast injuries all over her body. Her leg was torn open and she bled out." Krystle Campbell died in less than a minute from "massive blast injuries to her lower extremities. Parts of her body were shredded." Sean Collier, the MIT campus police officer killed three days after the bombings, "never had a chance." He was shot between the eyes. "They assassinated him." The brothers allegedly killed the 26-year-old officier for his service weapon but couldn't pry it loose from a safety holster. Dun Meng told the jury about his frightening 90 minutes with two carjackers, one who admitted being involved in the marathon bombing. He identified that person as Tamerlan Tsarnaev. Police fired 210 rounds at the brothers when they tracked a GPS device in Meng's stolen Mercedes and cornered them in Watertown, Massachusetts. Dzhokhar Tsarnaev struck Tamerlan, who was wounded, when he charged police in the car. Tamerlan died of his injuries. "Tamerlan wanted suicide by cop," the prosecutor said Monday. "He was ready for heaven. But the defendant had other plans." Dzhokhar ditched the stolen car and sought shelter in a dry-docked boat parked in a trailer in a backyard in Watertown. As he hid, he used a pencil to scrawl what prosecutors called a "manifesto," in which he said he was jealous of his brother for dying as a martyr and reaching paradise. He also lashed out at the United States for policies he said killed Muslims, writing, "I can't stand to see such evil go unpunished. We Muslims are one body, you hurt one you hurt us all." Federal prosecutors also presented evidence gleaned from searches of the brothers' computers, including militant literature written by top al Qaeda leaders. And they traced the purchase of the pressure cookers, ammunition and BBs, which appeared to have been made by Tamerlan. Boston Marathon Terror Attack Fast Facts . \ No newline at end of file diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 7257a13686b..3b57b62176c 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -70,6 +70,42 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, actual_in_len, actual_out_len, load_time, model.peak_memory]) +def preprocess_prompt(tokenizer, in_len, task): + if task == 'summarize': + if in_len == 512: + input_str = open(f"prompt/summarize/cnn_239.txt", 'r').read() + elif in_len == 1024: + input_str = open(f"prompt/summarize/cnn_615.txt", 'r').read() + elif in_len == 2048: + input_str = open(f"prompt/summarize/cnn_824.txt", 'r').read() + elif in_len <= 256: + input_str = open(f"prompt/summarize/cnn_64.txt", 'r').read() + else: + input_str = open(f"prompt/summarize/cnn_5618.txt", 'r').read() + question = "Can you please summarize this article?" 
+ prompt_format = "[INST] Article:```{}``` \n\n Question: {} \n\n [/INST]" + special_tokens_len = len(tokenizer.encode(prompt_format.format("", question), add_special_tokens=False)) + max_article_len = in_len - special_tokens_len + article_ids = tokenizer.encode(input_str, add_special_tokens=False) + if len(article_ids) > max_article_len: + article_ids = article_ids[:max_article_len] + truncated_article_text = tokenizer.decode(article_ids, skip_special_tokens=True) + final_prompt = prompt_format.format(truncated_article_text, question) + input_ids = tokenizer.encode(final_prompt, return_tensors="pt", truncation=True, max_length=in_len) + elif task == 'QA': + if in_len == 512: + input_str = open(f"prompt/QA/orca_776.txt", 'r').read() + elif in_len == 1024: + input_str = open(f"prompt/QA/orca_99.txt", 'r').read() + elif in_len == 2048: + input_str = open(f"prompt/QA/orca_401.txt", 'r').read() + elif in_len == 4096: + input_str = open(f"prompt/QA/orca_497.txt", 'r').read() + else: + raise ValueError("No corresponding prompt available now, will be added later.") + input_ids = tokenizer.encode(input_str, return_tensors="pt") + return input_ids + def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, num_trials=3, num_beams=1, low_bit='sym_int4', cpu_embedding=False, batch_size=1, streaming=False, use_fp16_torch_dtype=False, n_gpu=2): # TODO: make a parameter result= {} @@ -174,7 +210,7 @@ def run_native_int4(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - input_str = open(f"prompt/{in_len}.txt", 'r').read() + input_str = open(f"prompt/continuation/{in_len}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. n_ctx = in_len + out_len if in_len + out_len > 512 else 512 @@ -236,7 +272,7 @@ def run_transformer_int4(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -301,7 +337,7 @@ def run_pytorch_autocast_bf16(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -372,7 +408,7 @@ def run_optimize_model(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -463,28 +499,22 @@ def run_transformer_int4_gpu(repo_id, in_out_len = in_out.split("-") in_len = int(in_out_len[0]) out_len = int(in_out_len[1]) - # As different tokenizer has different encodings, - # in_len.txt maybe shorter than we need, - # use much longer context to make sure input length - test_length = min(in_len*2, 8192) - while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192: - test_length = test_length * 2 - # For the sequence length not in [32, 256, 1024, 2048, 8192], it will be truncated from 8192.txt. - test_length = min(test_length, 8192) - input_str = open(f"prompt/{test_length}.txt", 'r').read() - if lookahead: - question = "Can you please summarize this article?" - question_tokens = tokenizer.encode(question, return_tensors="pt") - max_article_len = in_len - question_tokens.size(1) - article_ids = tokenizer.encode(input_str, return_tensors="pt") - if article_ids.size(1) > max_article_len: - article_ids = article_ids[:, :max_article_len] - input_ids = torch.cat((article_ids, question_tokens), dim=1) - else: + if not lookahead or conf['task'] == 'continuation': + # As different tokenizer has different encodings, + # in_len.txt maybe shorter than we need, + # use much longer context to make sure input length + test_length = min(in_len*2, 8192) + while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192: + test_length = test_length * 2 + # For the sequence length not in [32, 256, 1024, 2048, 8192], it will be truncated from 8192.txt. + test_length = min(test_length, 8192) + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") input_ids = input_ids[:, :in_len] + elif conf['task'] == 'summarize' or conf['task'] == 'QA': + input_ids = preprocess_prompt(tokenizer, in_len, conf['task']) true_str = tokenizer.batch_decode(input_ids)[0] input_list = [true_str] * batch_size input_ids = tokenizer(input_list, return_tensors="pt").input_ids.to('xpu') @@ -567,7 +597,7 @@ def run_optimize_model_gpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -639,7 +669,7 @@ def run_ipex_fp16_gpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
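# Aside (not part of the patch): with the change to run_transformer_int4_gpu above,
# the prompt source now depends on the configured task, roughly:
#   - lookahead disabled, or task == 'continuation':
#       read prompt/continuation/<test_length>.txt and slice the encoded ids to in_len
#   - lookahead enabled and task in ('summarize', 'QA'):
#       build the prompt via preprocess_prompt(tokenizer, in_len, conf['task'])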
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -717,7 +747,7 @@ def run_bigdl_fp16_gpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -809,7 +839,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -896,7 +926,7 @@ def run_transformer_int4_gpu_win(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1005,7 +1035,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1109,7 +1139,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1198,7 +1228,7 @@ def run_transformer_autocast_bf16( repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1269,7 +1299,7 @@ def run_bigdl_ipex_bf16(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1339,7 +1369,7 @@ def run_bigdl_ipex_int4(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1409,7 +1439,7 @@ def run_bigdl_ipex_int8(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1518,9 +1548,9 @@ def get_int_from_env(env_keys, default): # in_len.txt maybe shorter than we need, # use much longer context to make sure input length test_length = min(in_len*2, 8192) - while test_length not in [32, 256, 1024, 2048, 8192]: + while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1597,7 +1627,7 @@ def run_speculative_cpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1675,7 +1705,7 @@ def run_speculative_gpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1795,7 +1825,7 @@ def run_pipeline_parallel_gpu(repo_id, test_length = min(in_len*2, 8192) while test_length not in [32, 256, 1024, 2048, 8192]: test_length = test_length * 2 - input_str = open(f"prompt/{test_length}.txt", 'r').read() + input_str = open(f"prompt/continuation/{test_length}.txt", 'r').read() # As different tokenizer has different encodings, # slice the input_ids to ensure the prompt length is required length. 
input_ids = tokenizer.encode(input_str, return_tensors="pt") @@ -1827,6 +1857,7 @@ def run_pipeline_parallel_gpu(repo_id, if __name__ == '__main__': from omegaconf import OmegaConf + global conf conf = OmegaConf.load(f'{current_dir}/config.yaml') today = date.today() if 'exclude' in conf: @@ -1844,17 +1875,22 @@ def run_pipeline_parallel_gpu(repo_id, import pandas as pd for api in conf.test_api: global csv_name - csv_name = f'{current_dir}/{api}-results-{today}.csv' - for model in conf.repo_id: - in_out_pairs = conf['in_out_pairs'].copy() - if excludes: - for in_out in conf['in_out_pairs']: - model_id_input = model + ':' + in_out.split('-')[0] - model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size']) - if model_id_input in excludes or model_id_input_batch_size in excludes: - in_out_pairs.remove(in_out) - run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'], - conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu) + csv_name = f'{current_dir}/{api}-results-{today}.csv' + if not OmegaConf.is_list(conf["batch_size"]): + batch_list = [conf["batch_size"]] + else: + batch_list = conf["batch_size"] + for batch_size in batch_list: + for model in conf.repo_id: + in_out_pairs = conf['in_out_pairs'].copy() + if excludes: + for in_out in conf['in_out_pairs']: + model_id_input = model + ':' + in_out.split('-')[0] + model_id_input_batch_size = model_id_input + ':' + str(batch_size) + if model_id_input in excludes or model_id_input_batch_size in excludes: + in_out_pairs.remove(in_out) + run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'], + conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu) df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)', 'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding', 'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype']) diff --git a/python/llm/dev/benchmark/perplexity/run_wikitext.py b/python/llm/dev/benchmark/perplexity/run_wikitext.py index a5b7dea3e7a..531ffff51e5 100644 --- a/python/llm/dev/benchmark/perplexity/run_wikitext.py +++ b/python/llm/dev/benchmark/perplexity/run_wikitext.py @@ -44,6 +44,7 @@ use_cache=args.use_cache, trust_remote_code=True) model = model.half() model = model.to(args.device) +model = model.eval() with open(args.data_path, "rb") as f: data = f.read() diff --git a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py index 61ca1b4a9d9..87cefc3160d 100644 --- a/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py +++ b/python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py @@ -51,7 +51,6 @@ # code change to import from IPEX-LLM API instead of using transformers API from ipex_llm.transformers import AutoModelForCausalLM from transformers import LlamaTokenizer -import intel_extension_for_pytorch as ipex def load(model_name_or_path): diff --git a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py index 77383dd8591..05803b75de7 100644 --- a/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py +++ b/python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py @@ -47,7 
+47,6 @@ import deepspeed from ipex_llm import optimize_model import torch -import intel_extension_for_pytorch as ipex import time import argparse diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md index 9d82496ea7d..74c5c7871bd 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md @@ -21,7 +21,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y In the example [generate.py](./generate.py), we show a basic use case to load a GGUF LLaMA2 model into `ipex-llm` using `from_gguf()` API, with IPEX-LLM optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md index 93f07a06702..e38cbf6f556 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Aquila model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/README.md index 730e7d4795c..fb7d16872fe 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Aquila2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
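Back in the benchmark driver's `__main__` block, `batch_size` may now be either a single value or a list in `config.yaml`. A minimal sketch of that normalization with OmegaConf, using an inline config as a stand-in for the real file:

```python
# Illustrative sketch: accept a scalar or a list for batch_size, mirroring
# the driver change above. The inline config is an assumption for the demo.
from omegaconf import OmegaConf

conf = OmegaConf.create({"batch_size": [1, 2]})  # a plain int also works

if not OmegaConf.is_list(conf["batch_size"]):
    batch_list = [conf["batch_size"]]
else:
    batch_list = conf["batch_size"]

for batch_size in batch_list:
    print(f"would benchmark with batch_size={batch_size}")
```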
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md index 1d7006b3617..2813602abaf 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a ChatGLM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegemma/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegemma/README.md index 76b96e9ae2a..500e4027652 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegemma/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codegemma/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a CodeGemma model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/README.md index ea5fd312370..b859e801463 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a CodeShell model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/README.md index ff3fc050c54..0ad5949daf8 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek-moe/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a DeepSeek-MoE model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/README.md index 4b57416c4d0..217a149daed 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/distil-whisper/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Recognize Tokens using `generate()` API In the example [recognize.py](./recognize.py), we show a basic use case for a Distil-Whisper model to conduct transcription using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/README.md index de95d858bd5..15a4b22b431 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Flan-t5 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/README.md index 49942b010fc..eeaa969444b 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for an Fuyu model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/README.md index 3deb7bb21bf..9c1b02b20ee 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Multi-turn chat centered around an image using `chat()` API In the example [chat.py](./chat.py), we show a basic use case for an InternLM_XComposer model to start a multi-turn chat centered around an image using `chat()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm/README.md new file mode 100644 index 00000000000..34a4aced900 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm/README.md @@ -0,0 +1,71 @@ +# MiniCPM +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM models. For illustration purposes, we utilize the [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) as a reference MiniCPM model. + +## 0. Requirements +To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a MiniCPM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. +### 1. 
Install +We suggest using conda to manage the environment: + +On Linux: + +```bash +conda create -n llm python=3.11 +conda activate llm + +# install ipex-llm with 'all' option +pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu +``` +On Windows: + +```cmd +conda create -n llm python=3.11 +conda activate llm + +pip install --pre --upgrade ipex-llm[all] +``` + +### 2. Run +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM model (e.g. `openbmb/MiniCPM-2B-sft-bf16`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'openbmb/MiniCPM-2B-sft-bf16'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `'What is AI?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`. + +> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format. In theory, a *X*B model saved in 16-bit will require approximately 2*X* GB of memory for loading, and ~0.5*X* GB of memory for further inference. +> +> Please select the appropriate size of the MiniCPM model based on the capabilities of your machine. + +#### 2.1 Client +On a client Windows machine, it is recommended to run directly with full utilization of all cores: +```cmd +python ./generate.py +``` + +#### 2.2 Server +For optimal performance on a server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information), and run the example with all the physical cores of a single socket. + +E.g. on Linux, +```bash +# set IPEX-LLM env variables +source ipex-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./generate.py +``` + +#### 2.3 Sample Output +#### [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<用户>what is AI? +-------------------- Output -------------------- + <用户>what is AI? AI, or Artificial Intelligence, refers to the simulation of human intelligence in machines that are programmed to think and learn like humans. It is a broad field of computer +``` \ No newline at end of file diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm/generate.py new file mode 100644 index 00000000000..8bdb2fcb09c --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm/generate.py @@ -0,0 +1,72 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import torch +import time +import argparse + +from ipex_llm.transformers import AutoModelForCausalLM +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for MiniCPM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-2B-sft-bf16", + help='The huggingface repo id for the MiniCPM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True, + use_cache=True) + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + + # here the prompt formatting refers to: https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16/blob/79fbb1db171e6d8bf77cdb0a94076a43003abd9e/modeling_minicpm.py#L1320 + chat = [ + { "role": "user", "content": args.prompt }, + ] + prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + input_ids = tokenizer.encode(prompt, return_tensors="pt") + + # start inference + st = time.time() + + output = model.generate(input_ids, + do_sample=False, + max_new_tokens=args.n_predict) + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/README.md index 78abbe27514..49be918ab16 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/README.md @@ -9,7 +9,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mistral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/README.md index 6514817e5ca..08b4f064d17 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/README.md @@ -9,7 +9,7 @@ To run these examples with IPEX-LLM on Intel CPUs, we have some recommended requ ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mixtral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel CPUs. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/README.md index e3c32c740d9..e8ef3b3887e 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phi-1_5 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/README.md index b211fd9545a..0e47ebedd9c 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phi-2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: @@ -25,6 +25,7 @@ conda activate llm # install the latest ipex-llm nightly build with 'all' option pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu pip install einops # additional package required for phi-2 to conduct generation +pip install transformers==4.37.0 ``` On Windows: @@ -34,6 +35,7 @@ conda activate llm pip install --pre --upgrade ipex-llm[all] pip install einops +pip install transformers==4.37.0 ``` ### 2. Run diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-3/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-3/README.md index ff9f870be7d..b3b7dc5f5b2 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-3/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-3/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phi-3 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: @@ -76,6 +76,7 @@ In the example, several arguments can be passed to satisfy your requirements: #### 2.4 Sample Output #### [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) ```log +Inference time: xxxx s -------------------- Prompt -------------------- <|user|> What is AI?<|end|> diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/README.md index 76563a99033..1e0d919829a 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phixtral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md index 777c60df3f3..7dc3dedc5cb 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Multimodal chat using `chat()` API In the example [chat.py](./chat.py), we show a basic use case for a Qwen-VL model to start a multimodal chat using `chat()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2/README.md new file mode 100644 index 00000000000..1de09015c17 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2/README.md @@ -0,0 +1,83 @@ +# Qwen2 + +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Qwen2 models. For illustration purposes, we utilize the [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) as a reference Qwen2 model. + +## 0. Requirements +To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a Qwen model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. +### 1. Install +We suggest using conda to manage the environment: + +On Linux: + +```bash +conda create -n llm python=3.11 +conda activate llm + +# install ipex-llm with 'all' option +pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu +pip install transformers==4.37.0 # install a transformers version that supports Qwen2 +``` + +On Windows: + +```cmd +conda create -n llm python=3.11 +conda activate llm + +pip install --pre --upgrade ipex-llm[all] +pip install transformers==4.37.0 +``` + +### 2. Run +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Qwen model to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'Qwen/Qwen2-7B-Instruct'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`. + +> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format.
In theory, a *X*B model saved in 16-bit will require approximately 2*X* GB of memory for loading, and ~0.5*X* GB of memory for further inference. +> +> Please select the appropriate size of the Qwen model based on the capabilities of your machine. + +#### 2.1 Client +On a client Windows machine, it is recommended to run directly with full utilization of all cores: +```cmd +python ./generate.py +``` + +#### 2.2 Server +For optimal performance on a server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information), and run the example with all the physical cores of a single socket. + +E.g. on Linux, +```bash +# set IPEX-LLM env variables +source ipex-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./generate.py +``` + +#### 2.3 Sample Output +##### [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +AI是什么? +-------------------- Output -------------------- +AI,即人工智能(Artificial Intelligence),是一门研究、开发用于模拟、延伸和扩展人类智能的理论、方法、技术及应用系统的学科 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +What is AI? +-------------------- Output -------------------- +AI stands for Artificial Intelligence, which is the simulation of human intelligence in machines that are programmed to think and learn like humans and mimic their actions. The term may +``` \ No newline at end of file diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2/generate.py b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2/generate.py new file mode 100644 index 00000000000..90626539427 --- /dev/null +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen2/generate.py @@ -0,0 +1,80 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import torch +import time +import argparse +import numpy as np + +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Qwen2-7B-Instruct') + parser.add_argument('--repo-id-or-model-path', type=str, default="Qwen/Qwen2-7B-Instruct", + help='The huggingface repo id for the Qwen2 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + + from ipex_llm.transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True, + use_cache=True) + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + prompt = args.prompt + + # Generate predicted tokens + with torch.inference_mode(): + # The following code for generation is adapted from https://huggingface.co/Qwen/Qwen2-7B-Instruct#quickstart + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} + ] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + model_inputs = tokenizer([text], return_tensors="pt") + st = time.time() + generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=args.n_predict + ) + end = time.time() + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(response) diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/README.md index 7a973a18446..558ee244acb 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/replit/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for an Replit model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/stablelm/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/stablelm/README.md index 64436a4ac83..08aed2a20e3 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/stablelm/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/stablelm/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a StableLM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/README.md index b3ea29d4457..70c9d662f92 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yi/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for an Yi model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/README.md index d39cdf5bf52..fb5c358b502 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/yuan2/README.md @@ -9,7 +9,7 @@ In addition, you need to modify some files in Yuan2-2B-hf folder, since Flash at ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for an Yuan2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/README.md b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/README.md index ff56fba8783..1582f794d97 100644 --- a/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/README.md +++ b/python/llm/example/CPU/HF-Transformers-AutoModels/Model/ziya/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Ziya model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py index e576cfd1c85..c138e7dd495 100644 --- a/python/llm/example/CPU/Native-Models/native_int4_pipeline.py +++ b/python/llm/example/CPU/Native-Models/native_int4_pipeline.py @@ -36,8 +36,7 @@ def load(model_path, model_family, n_threads): "llama": LlamaForCausalLM, "gptneox": GptneoxForCausalLM, "bloom": BloomForCausalLM, - "starcoder": StarcoderForCausalLM, - "chatglm": ChatGLMForCausalLM + "starcoder": StarcoderForCausalLM } if model_family in model_family_to_class: @@ -55,7 +54,7 @@ def load(model_path, model_family, n_threads): def inference(llm, repo_id_or_model_path, model_family, prompt): - if model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm']: + if model_family in ['llama', 'gptneox', 'bloom', 'starcoder']: # ------ Option 1: Use IPEX-LLM based tokenizer print('-'*20, ' IPEX-LLM based tokenizer ', '-'*20) st = time.time() @@ -109,9 +108,9 @@ def main(): parser.add_argument('--thread-num', type=int, default=2, required=True, help='Number of threads to use for inference') parser.add_argument('--model-family', type=str, default='llama', required=True, - choices=["llama", "llama2", "bloom", "gptneox", "starcoder", "chatglm"], + choices=["llama", "llama2", "bloom", "gptneox", "starcoder"], help="The model family of the large language model (supported option: 'llama', 'llama2', " - "'gptneox', 'bloom', 'starcoder', 'chatglm')") + "'gptneox', 'bloom', 'starcoder')") parser.add_argument('--repo-id-or-model-path', type=str, required=True, help='The path to the huggingface checkpoint folder') parser.add_argument('--prompt', type=str, default='Once upon a time, there existed a little girl who liked to have adventures. 
', diff --git a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/README.md index 50526cb78bc..0bbefe31b41 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/aquila2/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/aquila2/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Aquila2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bark/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bark/README.md index e014a3e1524..971c6a759a6 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bark/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/bark/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Synthesize speech with the given input text In the example [synthesize_speech.py](./synthesize_speech.py), we show a basic use case for Bark model to synthesize speech based on the given text, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bert/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bert/README.md index bf9eee36715..66ed443807f 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bert/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/bert/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Extract the feature of given text In the example [extract_feature.py](./extract_feature.py), we show a basic use case for a BERT model to extract the feature of given text, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/README.md b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/README.md index e6f33415083..98495b3c644 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/bluelm/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/bluelm/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a BlueLM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md index a387980dc44..b6b53991ffc 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a ChatGLM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/README.md b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/README.md index 736e3ce79b6..43e18acf1f0 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/chatglm3/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a ChatGLM3 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codegemma/README.md b/python/llm/example/CPU/PyTorch-Models/Model/codegemma/README.md index d0edbf9465b..81831376da7 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codegemma/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/codegemma/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a CodeGemma model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codellama/README.md b/python/llm/example/CPU/PyTorch-Models/Model/codellama/README.md index 8504713b378..7705f2d78ca 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codellama/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/codellama/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a CodeLlama model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/README.md b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/README.md index 2b9b9c1dec1..2ad00dad793 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/codeshell/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/codeshell/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a CodeShell model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/README.md b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/README.md index 62c89a57bad..7ed2d846f49 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/deciLM-7b/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a DeciLM-7B model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/README.md b/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/README.md index e0ed005931d..f7e2035d7f2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek-moe/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a deepseek-moe model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/README.md b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/README.md index 88315963df5..0de0dc8d3c1 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/deepseek/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/deepseek/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Deepseek model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/README.md b/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/README.md index 2ab17e17c83..35166a6d9d2 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/distil-whisper/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Recognize Tokens using `generate()` API In the example [recognize.py](./recognize.py), we show a basic use case for a Distil-Whisper model to conduct transcription using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/README.md b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/README.md index de95d858bd5..15a4b22b431 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/flan-t5/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Flan-t5 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/README.md b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/README.md index 84de78355bf..ee9c40431fd 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/fuyu/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/fuyu/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for an Fuyu model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/README.md b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/README.md index bc27022fd77..1f0775e17e6 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm-xcomposer/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Multi-turn chat centered around an image using `chat()` API In the example [chat.py](./chat.py), we show a basic use case for an InternLM_XComposer model to start a multi-turn chat centered around an image using `chat()` API, with IPEX-LLM 'optimize_model' API. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md index 024132700cb..c3588a15c09 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/internlm2/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a InternLM2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md index 2d56b03fb68..bd9083cc6da 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/llama2/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Llama2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llama3/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llama3/README.md index f50a7ebf5ae..ceadb191ab7 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llama3/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/llama3/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Llama3 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: @@ -66,7 +66,7 @@ In the example, several arguments can be passed to satisfy your requirements: - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. -#### 2.3 Sample Output +#### 2.4 Sample Output #### [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) ```log Inference time: xxxx s @@ -84,4 +84,4 @@ What is AI?<|eot_id|><|start_header_id|>assistant<|end_header_id|> Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as: 1. Learning: AI -``` +``` \ No newline at end of file diff --git a/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md b/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md index 08ea2c0e804..aa44bf44e44 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/llava/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Multi-turn chat centered around an image using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a LLaVA model to start a multi-turn chat centered around an image using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mamba/README.md b/python/llm/example/CPU/PyTorch-Models/Model/mamba/README.md index bd47c7b2f90..37a4594989a 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mamba/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/mamba/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mamba model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/minicpm/README.md b/python/llm/example/CPU/PyTorch-Models/Model/minicpm/README.md new file mode 100644 index 00000000000..6a14ffc2212 --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/Model/minicpm/README.md @@ -0,0 +1,74 @@ +# MiniCPM +In this directory, you will find examples on how you could use IPEX-LLM `optimize_model` API to accelerate MiniCPM models. For illustration purposes, we utilize the [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) as a reference MiniCPM model. + +## Requirements +To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a MiniCPM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. +### 1. Install +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). + +After installing conda, create a Python environment for IPEX-LLM: + +On Linux: + +```bash +conda create -n llm python=3.11 # recommend to use Python 3.11 +conda activate llm + +# install the latest ipex-llm nightly build with 'all' option +pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu +``` + +On Windows: + +```cmd +conda create -n llm python=3.11 +conda activate llm + +pip install --pre --upgrade ipex-llm[all] +``` + +### 2. Run +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM model (e.g. `openbmb/MiniCPM-2B-sft-bf16`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'openbmb/MiniCPM-2B-sft-bf16'`. +- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. 
+ +> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format. In theory, a *X*B model saved in 16-bit will require approximately 2*X* GB of memory for loading, and ~0.5*X* GB memory for further inference. +> +> Please select the appropriate size of the MiniCPM model based on the capabilities of your machine. + +#### 2.1 Client +On client Windows machines, it is recommended to run directly with full utilization of all cores: +```cmd +python ./generate.py --prompt 'What is AI?' +``` + +#### 2.2 Server +For optimal performance on server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information), and run the example with all the physical cores of a single socket. + +E.g. on Linux, +```bash +# set IPEX-LLM env variables +source ipex-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./generate.py --prompt 'What is AI?' +``` + +#### 2.3 Sample Output +#### [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<用户>what is AI? +-------------------- Output -------------------- + <用户>what is AI? AI, or Artificial Intelligence, refers to the simulation of human intelligence in machines that are programmed to think and learn like humans. It is a broad field of computer +``` diff --git a/python/llm/example/CPU/PyTorch-Models/Model/minicpm/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/minicpm/generate.py new file mode 100644 index 00000000000..50ea8ff61d0 --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/Model/minicpm/generate.py @@ -0,0 +1,74 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import torch +import time +import argparse + +from transformers import AutoTokenizer, AutoModelForCausalLM +from ipex_llm import optimize_model + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for MiniCPM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-2B-sft-bf16", + help='The huggingface repo id for the MiniCPM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + # Load model + model = AutoModelForCausalLM.from_pretrained(model_path, + trust_remote_code=True, + torch_dtype='auto', + low_cpu_mem_usage=True, + use_cache=True) + + # With only one line to enable IPEX-LLM optimization on model + model = optimize_model(model) + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + + # here the prompt formatting refers to: https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16/blob/79fbb1db171e6d8bf77cdb0a94076a43003abd9e/modeling_minicpm.py#L1320 + chat = [ + { "role": "user", "content": args.prompt }, + ] + prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + input_ids = tokenizer.encode(prompt, return_tensors="pt") + + # start inference + st = time.time() + + output = model.generate(input_ids, + do_sample=False, + max_new_tokens=args.n_predict) + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mistral/README.md b/python/llm/example/CPU/PyTorch-Models/Model/mistral/README.md index e058a716eec..c3b3227a837 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mistral/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/mistral/README.md @@ -9,7 +9,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mistral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
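The note in the new MiniCPM README above gives a rule of thumb: an *X*B model stored in 16-bit needs roughly 2*X* GB of memory to load and about 0.5*X* GB once the linear layers are converted to INT4 for inference. A minimal sketch of that arithmetic follows; `estimate_memory_gb` is a hypothetical helper for illustration, not part of the example code.

```python
# Illustrative only: the 16-bit-load / INT4-inference rule of thumb from the README note.
def estimate_memory_gb(params_in_billions: float) -> dict:
    return {
        "fp16_load_gb": 2.0 * params_in_billions,       # ~2X GB to load 16-bit weights
        "int4_inference_gb": 0.5 * params_in_billions,  # ~0.5X GB after INT4 conversion
    }

# e.g. MiniCPM-2B: roughly 4 GB to load in 16-bit, ~1 GB for INT4 inference
print(estimate_memory_gb(2))
```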
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/README.md b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/README.md index 6bbcc00841a..86253049875 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/mixtral/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/mixtral/README.md @@ -9,7 +9,7 @@ To run these examples with IPEX-LLM on Intel CPUs, we have some recommended requ ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Mixtral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel CPUs. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/README.md b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/README.md index 65be1ecae69..f006af538ce 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-1_5/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phi-1_5 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/README.md index 2320490d03f..515dc5e8c36 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-2/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-2/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phi-2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: @@ -20,6 +20,7 @@ conda activate llm # install the latest ipex-llm nightly build with 'all' option pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu pip install einops +pip install transformers==4.37.0 ``` On Windows: @@ -30,6 +31,7 @@ conda activate llm pip install --pre --upgrade ipex-llm[all] pip install einops +pip install transformers==4.37.0 ``` ### 2. Run diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phi-3/README.md b/python/llm/example/CPU/PyTorch-Models/Model/phi-3/README.md index 66b9eac9beb..3cacedff2e3 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phi-3/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/phi-3/README.md @@ -12,7 +12,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phi-3 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: @@ -73,6 +73,7 @@ In the example, several arguments can be passed to satisfy your requirements: #### 2.4 Sample Output #### [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) ```log +Inference time: xxxx s -------------------- Prompt -------------------- <|user|> What is AI?<|end|> diff --git a/python/llm/example/CPU/PyTorch-Models/Model/phixtral/README.md b/python/llm/example/CPU/PyTorch-Models/Model/phixtral/README.md index 3daadbadb7a..b1d1f0d8da6 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/phixtral/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/phixtral/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a phixtral model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md index b28d49e60b4..25744465c26 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen-vl/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Multimodal chat using `chat()` API In the example [chat.py](./chat.py), we show a basic use case for a Qwen-VL model to start a multimodal chat using `chat()` API, with IPEX-LLM 'optimize_model' API. ### 1. 
Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/README.md b/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/README.md index 7841702b92f..09ce24ec3e3 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen1.5/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Qwen1.5 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/qwen2/README.md new file mode 100644 index 00000000000..84d7cc1293f --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen2/README.md @@ -0,0 +1,84 @@ +# Qwen2 +In this directory, you will find examples on how you could use IPEX-LLM `optimize_model` API to accelerate Qwen2 models. For illustration purposes, we utilize the [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) as a reference Qwen2 model. + +## Requirements +To run these examples with IPEX-LLM, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a Qwen2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. +### 1. Install +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). + +After installing conda, create a Python environment for IPEX-LLM: + +On Linux: + +```bash +conda create -n llm python=3.11 # recommend to use Python 3.11 +conda activate llm + +# install the latest ipex-llm nightly build with 'all' option +pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu +pip install transformers==4.37.0 # install transformers which supports Qwen2 +``` + +On Windows: + +```cmd +conda create -n llm python=3.11 +conda activate llm + +pip install --pre --upgrade ipex-llm[all] +pip install transformers==4.37.0 +``` + +### 2. Run +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Qwen2 model to be downloaded, or the path to the huggingface checkpoint folder.
It is default to be `'Qwen/Qwen2-7B-Instruct'`. +- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +> **Note**: When loading the model in 4-bit, IPEX-LLM converts linear layers in the model into INT4 format. In theory, a *X*B model saved in 16-bit will require approximately 2*X* GB of memory for loading, and ~0.5*X* GB memory for further inference. +> +> Please select the appropriate size of the Qwen model based on the capabilities of your machine. + +#### 2.1 Client +On client Windows machines, it is recommended to run directly with full utilization of all cores: +```cmd +python ./generate.py --prompt 'What is AI?' +``` + +#### 2.2 Server +For optimal performance on server, it is recommended to set several environment variables (refer to [here](../README.md#best-known-configuration-on-linux) for more information), and run the example with all the physical cores of a single socket. + +E.g. on Linux, +```bash +# set IPEX-LLM env variables +source ipex-llm-init + +# e.g. for a server with 48 cores per socket +export OMP_NUM_THREADS=48 +numactl -C 0-47 -m 0 python ./generate.py --prompt 'What is AI?' +``` + +#### 2.3 Sample Output +##### [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +AI是什么? +-------------------- Output -------------------- +AI，即人工智能（Artificial Intelligence），是一门研究、开发用于模拟、延伸和扩展人的智能的理论、方法、技术及应用系统的学科 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +What is AI? +-------------------- Output -------------------- +AI stands for Artificial Intelligence, which refers to the development of computer systems that can perform tasks that typically require human intelligence. These tasks may include learning from experience, +``` diff --git a/python/llm/example/CPU/PyTorch-Models/Model/qwen2/generate.py b/python/llm/example/CPU/PyTorch-Models/Model/qwen2/generate.py new file mode 100644 index 00000000000..0c9b428abd6 --- /dev/null +++ b/python/llm/example/CPU/PyTorch-Models/Model/qwen2/generate.py @@ -0,0 +1,82 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import torch +import time +import argparse +import numpy as np + +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Qwen2-7B-Instruct') + parser.add_argument('--repo-id-or-model-path', type=str, default="Qwen/Qwen2-7B-Instruct", + help='The huggingface repo id for the Qwen2 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + + from transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(model_path, + trust_remote_code=True, + torch_dtype='auto', + low_cpu_mem_usage=True, + use_cache=True) + + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + from ipex_llm import optimize_model + model = optimize_model(model) + + prompt = args.prompt + # Generate predicted tokens + with torch.inference_mode(): + # The following code for generation is adapted from https://huggingface.co/Qwen/Qwen2-7B-Instruct#quickstart + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} + ] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + model_inputs = tokenizer([text], return_tensors="pt") + st = time.time() + generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=args.n_predict + ) + end = time.time() + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(response) diff --git a/python/llm/example/CPU/PyTorch-Models/Model/skywork/README.md b/python/llm/example/CPU/PyTorch-Models/Model/skywork/README.md index 71277e69ec3..65afec706bb 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/skywork/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/skywork/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Skywork model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
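The new Qwen2 examples above pin `transformers==4.37.0` because Qwen2 support first landed in that release. If you prefer a runtime check over relying on the pin alone, a small guard along these lines can fail fast on an older install; this is an illustrative snippet, not part of generate.py.

```python
# Illustrative version guard; `packaging` ships as a dependency of transformers.
from packaging import version
import transformers

if version.parse(transformers.__version__) < version.parse("4.37.0"):
    raise RuntimeError(
        f"transformers {transformers.__version__} is too old for Qwen2; "
        "install it with `pip install transformers==4.37.0`"
    )
```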
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/solar/README.md b/python/llm/example/CPU/PyTorch-Models/Model/solar/README.md index 89bea91ec27..1d172795619 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/solar/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/solar/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a SOLAR model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/stablelm/README.md b/python/llm/example/CPU/PyTorch-Models/Model/stablelm/README.md index d2a44a255c1..3166340ff69 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/stablelm/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/stablelm/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a StableLM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/README.md b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/README.md index 49f903c58be..2e266f5fa6f 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/wizardcoder-python/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a WizardCoder-Python model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yi/README.md b/python/llm/example/CPU/PyTorch-Models/Model/yi/README.md index c7eb8f27599..d96e5a13757 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yi/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/yi/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Yi model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/README.md b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/README.md index 403abc0548b..40737fce0e8 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/yuan2/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/yuan2/README.md @@ -9,7 +9,7 @@ In addition, you need to modify some files in Yuan2-2B-hf folder, since Flash at ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for an Yuan2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/PyTorch-Models/Model/ziya/README.md b/python/llm/example/CPU/PyTorch-Models/Model/ziya/README.md index ea43f9d3653..84544d5efb9 100644 --- a/python/llm/example/CPU/PyTorch-Models/Model/ziya/README.md +++ b/python/llm/example/CPU/PyTorch-Models/Model/ziya/README.md @@ -7,7 +7,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y ## Example: Predict Tokens using `generate()` API In the example [generate.py](./generate.py), we show a basic use case for a Ziya model to predict the next N tokens using `generate()` API, with IPEX-LLM 'optimize_model' API. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: diff --git a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py index c23622673b3..e7cd6eb296a 100644 --- a/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py +++ b/python/llm/example/CPU/QLoRA-FineTuning/alpaca-qlora/alpaca_qlora_finetuning_cpu.py @@ -39,7 +39,7 @@ from datasets import load_dataset import accelerate -from transformers import LlamaTokenizer +from transformers import AutoTokenizer from peft import ( get_peft_model_state_dict, set_peft_model_state_dict, @@ -198,13 +198,12 @@ def train( model = model.to("cpu") print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}") - tokenizer = LlamaTokenizer.from_pretrained(base_model) + tokenizer = AutoTokenizer.from_pretrained(base_model) print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}") - tokenizer.pad_token_id = ( - 0 # unk. we want this to be different from the eos token - ) - tokenizer.padding_side = "left" # Allow batched inference + # For Llama family + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token print(model) diff --git a/python/llm/example/CPU/Speculative-Decoding/EAGLE/README.md b/python/llm/example/CPU/Speculative-Decoding/EAGLE/README.md index f51c9ac349b..da768bb99f1 100644 --- a/python/llm/example/CPU/Speculative-Decoding/EAGLE/README.md +++ b/python/llm/example/CPU/Speculative-Decoding/EAGLE/README.md @@ -8,7 +8,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y In this example, we run inference for a Llama2 model to showcase the speed of EAGLE with IPEX-LLM on MT-bench data on Intel CPUs. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: ```bash diff --git a/python/llm/example/CPU/vLLM-Serving/README.md b/python/llm/example/CPU/vLLM-Serving/README.md index 72a8e734fe0..a99c137e81f 100644 --- a/python/llm/example/CPU/vLLM-Serving/README.md +++ b/python/llm/example/CPU/vLLM-Serving/README.md @@ -28,6 +28,7 @@ pip3 install "pydantic<2" # Required for OpenAI server. # Install vllm git clone https://github.com/vllm-project/vllm.git && \ cd ./vllm && \ +git checkout v0.4.2 && \ pip install wheel packaging ninja setuptools>=49.4.0 numpy && \ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \ VLLM_TARGET_DEVICE=cpu python3 setup.py install diff --git a/python/llm/example/CPU/vLLM-Serving/offline_inference.py b/python/llm/example/CPU/vLLM-Serving/offline_inference.py index 142b80d4be3..63b74f7db3d 100644 --- a/python/llm/example/CPU/vLLM-Serving/offline_inference.py +++ b/python/llm/example/CPU/vLLM-Serving/offline_inference.py @@ -46,7 +46,7 @@ # Create an LLM. # llm = LLM(model="facebook/opt-125m") -llm = LLM(model="YOUR_MODEL_PATH", device="cpu", load_in_low_bit="sym_int4") +llm = LLM(model="YOUR_MODEL_PATH", device="cpu", load_in_low_bit="bf16") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py index a5a268cfddb..0e76e798809 100644 --- a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py +++ b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/benchmark.py @@ -247,7 +247,7 @@ def benchmark( ) args = parser.parse_args() PROMPT_LENGTH = args.prompt_length -PROMPT = open(f"prompt/{PROMPT_LENGTH}.txt", "r").read() +PROMPT = open(f"prompt/continuation/{PROMPT_LENGTH}.txt", "r").read() MAX_TOKENS = args.max_new_tokens diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md index a979d5f6051..372f0a1f1ad 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/README.md @@ -19,7 +19,7 @@ To run these examples with IPEX-LLM, we have some recommended requirements for y In the example [generate.py](./generate.py), we show a basic use case to load a GGUF LLaMA2 model into `ipex-llm` using `from_gguf()` API, with IPEX-LLM optimizations. ### 1. Install -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). After installing conda, create a Python environment for IPEX-LLM: ```bash diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegemma/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegemma/README.md index b0564824dda..96a0b804edb 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegemma/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codegemma/README.md @@ -10,7 +10,7 @@ To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requ In the example [generate.py](./generate.py), we show a basic use case for a CodeGemma model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. ### 1. Install #### 1.1 Installation on Linux -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
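The alpaca-qlora change above swaps the hard-coded `pad_token_id = 0` for a fallback that reuses the EOS token, which is what Llama-family tokenizers need for padded batches since they define no pad token out of the box. A minimal sketch of that pattern is below; the model id is only a placeholder.

```python
# Sketch of the pad-token fallback adopted in alpaca_qlora_finetuning_cpu.py.
# "meta-llama/Llama-2-7b-hf" is a placeholder; any Llama-family checkpoint behaves the same.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
if tokenizer.pad_token is None:                # Llama tokenizers ship without a pad token
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS so batched padding works

batch = tokenizer(["What is AI?", "Explain QLoRA."], padding=True, return_tensors="pt")
print(batch["attention_mask"])                 # padded positions are masked out
```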
After installing conda, create a Python environment for IPEX-LLM: ```bash diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py index 9aeed3ba970..aeb94f3526c 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/codeshell/server.py @@ -274,9 +274,6 @@ def _get_args(): multi_turn = args.multi_turn max_context = args.max_context - if device == 'xpu': - import intel_extension_for_pytorch as ipex - model = model.to(device) model.generation_config = GenerationConfig.from_pretrained( diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py index 866f52f51dc..4359a06ea1b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/internlm2/generate.py @@ -20,7 +20,6 @@ from transformers import AutoTokenizer from ipex_llm import optimize_model -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b/blob/main/modeling_internlm.py#L1053 diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm/README.md new file mode 100644 index 00000000000..45a213ba48e --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm/README.md @@ -0,0 +1,123 @@ +# MiniCPM +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on MiniCPM models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) as a reference MiniCPM model. + +## 0. Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a MiniCPM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. 
Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --prompt 'What is AI?' +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM model (e.g. `openbmb/MiniCPM-2B-sft-bf16`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'openbmb/MiniCPM-2B-sft-bf16'`. +- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +#### Sample Output +#### [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<用户>what is AI? +-------------------- Output -------------------- + <用户>what is AI? AI, or Artificial Intelligence, refers to the simulation of human intelligence in machines that are programmed to think and learn like humans. It is a field of computer science +``` diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm/generate.py new file mode 100644 index 00000000000..669162e61a1 --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/minicpm/generate.py @@ -0,0 +1,80 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import time +import argparse + +from ipex_llm.transformers import AutoModelForCausalLM +from transformers import AutoTokenizer + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for MiniCPM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-2B-sft-bf16", + help='The huggingface repo id for the MiniCPM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the from_pretrained function. + # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU. 
+ model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + trust_remote_code=True, + optimize_model=True, + use_cache=True) + + model = model.to('xpu') + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + + # here the prompt formatting refers to: https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16/blob/79fbb1db171e6d8bf77cdb0a94076a43003abd9e/modeling_minicpm.py#L1320 + chat = [ + { "role": "user", "content": args.prompt }, + ] + prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + + # ipex_llm model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + # start inference + st = time.time() + + output = model.generate(input_ids, + do_sample=False, + max_new_tokens=args.n_predict) + torch.xpu.synchronize() + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/README.md index d8c37adba81..87789eb083b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phi-2/README.md @@ -16,6 +16,7 @@ conda activate llm pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ pip install einops # additional package required for phi-2 to conduct generation +pip install transformers==4.37.0 ``` #### 1.2 Installation on Windows @@ -28,6 +29,7 @@ conda activate llm pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ pip install einops # additional package required for phi-2 to conduct generation +pip install transformers==4.37.0 ``` ### 2. 
Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py index b6883b3245a..e4ab452b31f 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/phixtral/generate.py @@ -20,7 +20,6 @@ import numpy as np from transformers import AutoTokenizer, GenerationConfig -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py index 902b917054c..ec037cc7e36 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen1.5/generate.py @@ -20,7 +20,6 @@ from transformers import AutoTokenizer from ipex_llm import optimize_model -import intel_extension_for_pytorch as ipex import numpy as np diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2/README.md new file mode 100644 index 00000000000..dda7ba18461 --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2/README.md @@ -0,0 +1,134 @@ +# Qwen2 +In this directory, you will find examples on how you could apply IPEX-LLM INT4 optimizations on Qwen2 models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) as a reference Qwen2 model. + +## 0. Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a Qwen2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers==4.37.0 # install transformers which supports Qwen2 +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers==4.37.0 # install transformers which supports Qwen2 +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables.
Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Qwen2 model (e.g. `Qwen/Qwen2-7B-Instruct`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'Qwen/Qwen2-7B-Instruct'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`. + +#### Sample Output +##### [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +AI是什么? +-------------------- Output -------------------- +AI是人工智能(Artificial Intelligence)的缩写。它指的是由计算机系统表现出来的智能行为,这些行为通常包括学习、推理、问题解决 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +What is AI? +-------------------- Output -------------------- +AI, or Artificial Intelligence, refers to the simulation of human intelligence in machines that are programmed to think and learn like humans and mimic their actions. The term may +``` \ No newline at end of file diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2/generate.py new file mode 100644 index 00000000000..25fdaeec16a --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/qwen2/generate.py @@ -0,0 +1,92 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import torch +import time +import argparse + +from transformers import AutoTokenizer +from ipex_llm import optimize_model +import numpy as np + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Qwen2-7B-Instruct') + parser.add_argument('--repo-id-or-model-path', type=str, default="Qwen/Qwen2-7B-Instruct", + help='The huggingface repo id for the Qwen2 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + + from ipex_llm.transformers import AutoModelForCausalLM + # Load model in 4 bit, + # which convert the relevant layers in the model into INT4 format + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True, + use_cache=True) + model = model.to("xpu") + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + prompt = args.prompt + + # Generate predicted tokens + with torch.inference_mode(): + # The following code for generation is adapted from https://huggingface.co/Qwen/Qwen2-7B-Instruct#quickstart + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} + ] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + model_inputs = tokenizer([text], return_tensors="pt").to("xpu") + # warmup + generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=args.n_predict + ) + + st = time.time() + generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=args.n_predict + ) + torch.xpu.synchronize() + end = time.time() + generated_ids = generated_ids.cpu() + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(response) \ No newline at end of file diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py index c2d90c91e86..eb5e4aa42cc 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/solar/generate.py @@ -15,7 +15,6 @@ # import torch -import intel_extension_for_pytorch as ipex import time import argparse diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py index 8b7a358e511..5cb29639c7b 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Model/yuan2/generate.py @@ -16,7 +16,6 @@ import torch, transformers import sys, os, time -import intel_extension_for_pytorch as ipex import argparse from transformers import LlamaTokenizer from ipex_llm.transformers import AutoModelForCausalLM diff --git a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py index 
d4b593d15e2..f215bde463e 100644 --- a/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py @@ -180,10 +180,9 @@ def train( tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}") - tokenizer.pad_token_id = ( - 0 # unk. we want this to be different from the eos token - ) - tokenizer.padding_side = "left" # Allow batched inference + # For Llama family + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token print(model) diff --git a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py index 91271a780a2..e380d4948f6 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QA-LoRA/alpaca_qalora_finetuning.py @@ -192,11 +192,9 @@ def train( tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}") - tokenizer.pad_token_id = ( - 0 # unk. we want this to be different from the eos token - ) - tokenizer.padding_side = "left" # Allow batched inference - + # For Llama family + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token print(model) # Prepare a IPEX-LLM compatible Peft model diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index ecd3d4da266..61916fff9b7 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -192,6 +192,9 @@ def train( tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}") + # For Llama family + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token print(model) # Prepare a IPEX-LLM compatible Peft model diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md index e4233e37ee1..9bc8f254d44 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md +++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md @@ -1,4 +1,4 @@ -# Serve IPEX-LLM on Multiple Intel GPUs in multi-stage pipeline parallel fashion +# Serve IPEX-LLM on Multiple Intel GPUs in Multi-Stage Pipeline Parallel Fashion This example demonstrates how to run IPEX-LLM serving on multiple [Intel GPUs](../README.md) with Pipeline Parallel. 
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/llama_models.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/llama_models.py index 4244a735ee9..29bac7a70b7 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/llama_models.py +++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/llama_models.py @@ -50,8 +50,10 @@ def __init__(self, config: LlamaConfig): self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if self.pp_config.is_head: + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + if self.pp_config.is_tail: + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) def get_input_embeddings(self): @@ -259,7 +261,6 @@ def forward( if self.pp_config.is_tail: hidden_states = outputs[0] logits = self.lm_head(hidden_states) - logits = logits.float() loss = None if labels is not None: diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_models.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_models.py index 8cf19793619..dfc09d6d57c 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_models.py +++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_models.py @@ -1,7 +1,6 @@ from torch import nn import torch import torch.distributed as dist -import intel_extension_for_pytorch as ipex from typing import List, Optional, Tuple, Union, Iterator import time diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py index a73b5294660..6b85a611f31 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py +++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py @@ -2,7 +2,6 @@ import torch.nn.parallel import torch.distributed as dist import os -import intel_extension_for_pytorch as ipex import oneccl_bindings_for_pytorch diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md index 8974afddc45..1f51c5f9741 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md @@ -1,55 +1,70 @@ -# Run IPEX-LLM on Multiple Intel GPUs in pipeline parallel fashion +# Run IPEX-LLM on Multiple Intel GPUs in Pipeline Parallel Fashion -This example demonstrates how to run IPEX-LLM optimized low-bit model vertically partitioned on two [Intel GPUs](../README.md). +This example demonstrates how to run IPEX-LLM optimized low-bit model vertically partitioned on multiple [Intel GPUs](../README.md) for Linux users. ## Requirements To run this example with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../README.md#recommended-requirements) for more information. For this particular example, you will need at least two GPUs on your machine. 
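+
+As a quick sanity check (a minimal sketch, not part of the original example; it assumes the environment created in the Installation step below), you can confirm that PyTorch sees at least two XPU devices:
+
+```python
+import torch
+import intel_extension_for_pytorch  # noqa: F401  # registers the 'xpu' backend with PyTorch
+
+# Pipeline parallel inference in this example expects two or more Intel GPUs
+print(f"Visible XPU devices: {torch.xpu.device_count()}")
+```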
-## Example: +> [!NOTE] +> To run IPEX-LLM on multiple Intel GPUs in pipeline parallel fashion, you will need to install **Intel® oneAPI Base Toolkit 2024.1**, which could be done through an offline installer: +> ```bash +> wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/fdc7a2bc-b7a8-47eb-8876-de6201297144/l_BaseKit_p_2024.1.0.596_offline.sh +> +> sudo sh ./l_BaseKit_p_2024.1.0.596_offline.sh +> ``` -### 1.1 Install IPEX-LLM +## Example: Run pipeline parallel inference on multiple GPUs + +### 1. Installation ```bash conda create -n llm python=3.11 conda activate llm -# below command will install intel_extension_for_pytorch==2.1.10+xpu as default -# you can install specific ipex/torch version for your need -pip install --pre --upgrade ipex-llm[xpu_2.1] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -# configures OneAPI environment variables -source /opt/intel/oneapi/setvars.sh -conda install -c conda-forge -y gperftools=2.10 # to enable tcmalloc +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30+xpu oneccl_bind_pt==2.1.300+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ``` -### 1.2 Build and install patched version of Intel Extension for PyTorch (IPEX) +### 2. Configures OneAPI environment variables ```bash -conda activate llm source /opt/intel/oneapi/setvars.sh -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.1.10+xpu -git submodule update --init --recursive -git cherry-pick be8ea24078d8a271e53d2946ac533383f7a2aa78 -export USE_AOT_DEVLIST='ats-m150,pvc' -python setup.py install ``` +> [!NOTE] +> Please make sure you configure the environment variables for **Intel® oneAPI Base Toolkit's version == 2024.1.**. -> **Important**: IPEX 2.1.10+xpu requires Intel® oneAPI Base Toolkit's version == 2024.0. Please make sure you have installed the correct version. +### 3 Runtime Configurations -### 2. Run pipeline parallel inference on multiple GPUs -Here, we provide example usages on different models and different hardwares. Please refer to the appropriate script based on your model and device: +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. -### 3. Run +
-For optimal performance on Arc, it is recommended to set several environment variables. +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series ```bash export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 ``` +> [!NOTE] +> Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+### 4. Running examples ``` python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT --gpu-num GPU_NUM ``` @@ -61,7 +76,7 @@ Arguments info: - `--gpu-num GPU_NUM`: argument defining the number of GPU to use. It is default to be `2`. #### Sample Output -#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) +##### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) ```log Inference time: xxxx s -------------------- Prompt -------------------- diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py index 84f625a1776..ae3cedb10ca 100644 --- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py +++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py @@ -16,7 +16,6 @@ # import torch -import intel_extension_for_pytorch as ipex import time import argparse diff --git a/python/llm/example/GPU/PyTorch-Models/Model/codegemma/README.md b/python/llm/example/GPU/PyTorch-Models/Model/codegemma/README.md index df37bf837e8..1c145defdff 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/codegemma/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/codegemma/README.md @@ -10,7 +10,7 @@ To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requ In the example [generate.py](./generate.py), we show a basic use case for a CodeGemma model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. ### 1. Install #### 1.1 Installation on Linux -We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://docs.conda.io/en/latest/miniconda.html#). +We suggest using conda to manage the Python environment. For more information about conda installation, please refer to [here](https://conda-forge.org/download/). 
After installing conda, create a Python environment for IPEX-LLM: ```bash diff --git a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py index 4f03aa79557..516903171b1 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py @@ -20,7 +20,6 @@ from transformers import AutoTokenizer from ipex_llm import optimize_model -import intel_extension_for_pytorch as ipex # you could tune the prompt based on your own model, # here the prompt tuning refers to https://huggingface.co/internlm/internlm-chat-7b/blob/main/modeling_internlm.py#L1053 diff --git a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py index 6ec314b2fd5..5bee30df810 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/mamba/generate.py @@ -17,7 +17,6 @@ import argparse import time import torch -import intel_extension_for_pytorch as ipex from ipex_llm import optimize_model from transformers import AutoTokenizer diff --git a/python/llm/example/GPU/PyTorch-Models/Model/minicpm/README.md b/python/llm/example/GPU/PyTorch-Models/Model/minicpm/README.md new file mode 100644 index 00000000000..bce3215ef91 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/minicpm/README.md @@ -0,0 +1,123 @@ +# MiniCPM +In this directory, you will find examples on how you could use IPEX-LLM `optimize_model` API to accelerate MiniCPM models on [Intel GPUs](../../../README.md). For illustration purposes, we utilize the [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) as a reference MiniCPM model. + +## 0. Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a MiniCPM model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --prompt 'What is AI?' +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the MiniCPM model (e.g. `openbmb/MiniCPM-2B-sft-bf16`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'openbmb/MiniCPM-2B-sft-bf16'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `'What is AI?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`. + +#### Sample Output +##### [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +<用户>what is AI? +-------------------- Output -------------------- + <用户>what is AI? AI, or Artificial Intelligence, refers to the simulation of human intelligence in machines that are programmed to think and learn like humans. It is a field of computer science +``` diff --git a/python/llm/example/GPU/PyTorch-Models/Model/minicpm/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/minicpm/generate.py new file mode 100644 index 00000000000..b6f9a4cf3ca --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/minicpm/generate.py @@ -0,0 +1,81 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import time +import argparse + +from transformers import AutoModelForCausalLM, AutoTokenizer +from ipex_llm import optimize_model + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for MiniCPM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-2B-sft-bf16", + help='The huggingface repo id for the MiniCPM model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + # Load model + model = AutoModelForCausalLM.from_pretrained(model_path, + trust_remote_code=True, + torch_dtype='auto', + low_cpu_mem_usage=True, + use_cache=True) + + # With only one line to enable IPEX-LLM optimization on model + # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function. + # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
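+    # For instance (an illustrative variant, not what this example does by default), the call below
+    # could become:
+    #     model = optimize_model(model, cpu_embedding=True)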
+ model = optimize_model(model) + model = model.to('xpu') + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + + # here the prompt formatting refers to: https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16/blob/79fbb1db171e6d8bf77cdb0a94076a43003abd9e/modeling_minicpm.py#L1320 + chat = [ + { "role": "user", "content": args.prompt }, + ] + prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + + # ipex_llm model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + # start inference + st = time.time() + + output = model.generate(input_ids, + do_sample=False, + max_new_tokens=args.n_predict) + torch.xpu.synchronize() + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/README.md b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/README.md index 0ae7e51bf7c..bbd276b9612 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phi-2/README.md +++ b/python/llm/example/GPU/PyTorch-Models/Model/phi-2/README.md @@ -16,6 +16,7 @@ conda activate llm pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ pip install einops # additional package required for phi-2 to conduct generation +pip install transformers==4.37.0 ``` #### 1.2 Installation on Windows @@ -26,6 +27,7 @@ conda activate llm # below command will install intel_extension_for_pytorch==2.1.10+xpu as default pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pip install transformers==4.37.0 ``` ### 2. Configures OneAPI environment variables for Linux diff --git a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py index 0c8a278a3ea..ce8fe73ee03 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/phixtral/generate.py @@ -20,7 +20,6 @@ import numpy as np from transformers import AutoTokenizer, GenerationConfig -import intel_extension_for_pytorch as ipex from ipex_llm import optimize_model diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py index 8b81a4f53ae..880e448d9b5 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen1.5/generate.py @@ -20,7 +20,6 @@ from transformers import AutoTokenizer from ipex_llm import optimize_model -import intel_extension_for_pytorch as ipex import numpy as np diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen2/README.md b/python/llm/example/GPU/PyTorch-Models/Model/qwen2/README.md new file mode 100644 index 00000000000..9a3e3e03504 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen2/README.md @@ -0,0 +1,134 @@ +# Qwen2 +In this directory, you will find examples on how you could use IPEX-LLM `optimize_model` API to accelerate Qwen2 models on [Intel GPUs](../../../README.md). 
For illustration purposes, we utilize the [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) as a reference InternLM model. + +## 0. Requirements +To run these examples with IPEX-LLM on Intel GPUs, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API +In the example [generate.py](./generate.py), we show a basic use case for a Qwen2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel GPUs. +### 1. Install +#### 1.1 Installation on Linux +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers==4.37.0 # install transformers which supports Qwen2 +``` + +#### 1.2 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 libuv +conda activate llm + +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +pip install transformers==4.37.0 # install transformers which supports Qwen2 +``` + +### 2. Configures OneAPI environment variables for Linux + +> [!NOTE] +> Skip this step if you are running on Windows. + +This is a required step on Linux for APT or offline installed oneAPI. Skip this step for PIP-installed oneAPI. + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Runtime Configurations +For optimal performance, it is recommended to set several environment variables. Please check out the suggestions based on your device. +#### 3.1 Configurations for Linux +
+ +For Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series + +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +``` + +
+ +
+ +For Intel Data Center GPU Max Series + +```bash +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_CACHE_PERSISTENT=1 +export ENABLE_SDP_FUSION=1 +``` +> Note: Please note that `libtcmalloc.so` can be installed by `conda install -c conda-forge -y gperftools=2.10`. +
+ +
+ +For Intel iGPU + +```bash +export SYCL_CACHE_PERSISTENT=1 +export BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +#### 3.2 Configurations for Windows +
+ +For Intel iGPU + +```cmd +set SYCL_CACHE_PERSISTENT=1 +set BIGDL_LLM_XMX_DISABLED=1 +``` + +
+ +
+ +For Intel Arc™ A-Series Graphics + +```cmd +set SYCL_CACHE_PERSISTENT=1 +``` + +
+ +> [!NOTE] +> For the first time that each model runs on Intel iGPU/Intel Arc™ A300-Series or Pro A60, it may take several minutes to compile. +### 4. Running examples + +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Qwen2 model (e.g. `Qwen/Qwen2-7B-Instruct`) to be downloaded, or the path to the huggingface checkpoint folder. It defaults to `'Qwen/Qwen2-7B-Instruct'`. +- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It defaults to `'AI是什么?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`. + +#### Sample Output +##### [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +AI是什么? +-------------------- Output -------------------- +AI是人工智能(Artificial Intelligence)的缩写。它指的是通过计算机程序、算法和模型来模拟、延伸和扩展人类智能的一门学科 +``` + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +What is AI? +-------------------- Output -------------------- +AI stands for Artificial Intelligence. It refers to the simulation of human intelligence in machines that are programmed to think and work like humans. This includes learning from experience, +``` \ No newline at end of file diff --git a/python/llm/example/GPU/PyTorch-Models/Model/qwen2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/qwen2/generate.py new file mode 100644 index 00000000000..c3c19253004 --- /dev/null +++ b/python/llm/example/GPU/PyTorch-Models/Model/qwen2/generate.py @@ -0,0 +1,91 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import torch +import time +import argparse + +from transformers import AutoTokenizer +from ipex_llm import optimize_model +import numpy as np + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Qwen2-7B-Instruct') + parser.add_argument('--repo-id-or-model-path', type=str, default="Qwen/Qwen2-7B-Instruct", + help='The huggingface repo id for the Qwen2 model to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="AI是什么?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + + from transformers import AutoModelForCausalLM + from ipex_llm import optimize_model + model = AutoModelForCausalLM.from_pretrained(model_path, + trust_remote_code=True, + torch_dtype = 'auto', + low_cpu_mem_usage=True, + use_cache=True) + model = optimize_model(model) + model = model.to("xpu") + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + prompt = args.prompt + # Generate predicted tokens + with torch.inference_mode(): + # The following code for generation is adapted from https://huggingface.co/Qwen/Qwen2-7B-Instruct#quickstart + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} + ] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + model_inputs = tokenizer([text], return_tensors="pt").to("xpu") + # warmup + generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=args.n_predict + ) + + st = time.time() + generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=args.n_predict + ) + torch.xpu.synchronize() + end = time.time() + generated_ids = generated_ids.cpu() + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(response) \ No newline at end of file diff --git a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py index ee522a8ffa3..b612bc2fc7f 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py @@ -15,7 +15,6 @@ # import torch -import intel_extension_for_pytorch as ipex import time import argparse diff --git a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py index 9a887c09151..30706cb5bf0 100644 --- a/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py +++ b/python/llm/example/GPU/PyTorch-Models/Model/yuan2/generate.py @@ -16,7 +16,6 @@ import torch, transformers import sys, os, time -import intel_extension_for_pytorch as ipex import argparse from transformers import LlamaTokenizer, AutoModelForCausalLM from ipex_llm import optimize_model diff --git a/python/llm/example/GPU/README.md b/python/llm/example/GPU/README.md index aa29a8a15c7..ab13bf95485 100644 --- a/python/llm/example/GPU/README.md +++ b/python/llm/example/GPU/README.md @@ -7,7 +7,9 @@ This folder contains examples of running IPEX-LLM on Intel GPU: - 
[LLM-Finetuning](LLM-Finetuning): running ***finetuning*** (such as LoRA, QLoRA, QA-LoRA, etc) using IPEX-LLM on Intel GPUs - [vLLM-Serving](vLLM-Serving): running ***vLLM*** serving framework on intel GPUs (with IPEX-LLM low-bit optimized models) - [Deepspeed-AutoTP](Deepspeed-AutoTP): running distributed inference using ***DeepSpeed AutoTP*** (with IPEX-LLM low-bit optimized models) on Intel GPUs -- [Deepspeed-AutoTP-FastApi](Deepspeed-AutoTP-FastApi): running distributed inference using ***DeepSpeed AutoTP*** and start serving with ***FastApi***(with IPEX-LLM low-bit optimized models) on Intel GPUs +- [Deepspeed-AutoTP-FastAPI](Deepspeed-AutoTP-FastAPI): running distributed inference using ***DeepSpeed AutoTP*** and start serving with ***FastAPI*** (with IPEX-LLM low-bit optimized models) on Intel GPUs +- [Pipeline-Parallel-Inference](Pipeline-Parallel-Inference): running IPEX-LLM optimized low-bit model vertically partitioned on multiple Intel GPUs +- [Pipeline-Parallel-FastAPI](Pipeline-Parallel-FastAPI): running IPEX-LLM serving with **FastAPI** on multiple Intel GPUs in pipeline parallel fashion - [LangChain](LangChain): running ***LangChain*** applications on IPEX-LLM - [PyTorch-Models](PyTorch-Models): running any PyTorch model on IPEX-LLM (with "one-line code change") - [Speculative-Decoding](Speculative-Decoding): running any ***Hugging Face Transformers*** model with ***self-speculative decoding*** on Intel GPUs diff --git a/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat.py b/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat.py index 4b461652623..ffa9adfba0a 100644 --- a/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat.py +++ b/python/llm/example/GPU/Speculative-Decoding/EAGLE/evaluation/gen_ea_answer_llama2chat.py @@ -47,7 +47,6 @@ from eagle.model.utils import * from eagle.model.kv_cache import initialize_past_key_values from eagle.model.choices import * -import intel_extension_for_pytorch as ipex from ipex_llm import optimize_model def ea_forward(input_ids, model, tokenizer, tree_choices, logits_processor=None, max_steps=512): diff --git a/python/llm/scripts/README.md b/python/llm/scripts/README.md index 20724652f2b..72cd9fe378c 100644 --- a/python/llm/scripts/README.md +++ b/python/llm/scripts/README.md @@ -17,7 +17,7 @@ sudo apt install xpu-smi ### Usage -* After installing `ipex-llm`, open a terminal (on Linux) or **Anaconda Prompt** (on Windows), and activate the conda environment you have created for running `ipex-llm`: +* After installing `ipex-llm`, open a terminal (on Linux) or **Miniforge Prompt** (on Windows), and activate the conda environment you have created for running `ipex-llm`: ``` conda activate llm ``` diff --git a/python/llm/setup.py b/python/llm/setup.py index fba5d17ebda..c6ae0581d9f 100644 --- a/python/llm/setup.py +++ b/python/llm/setup.py @@ -86,12 +86,7 @@ "quantize-llama_vnni.exe", "quantize-gptneox_vnni.exe", "quantize-bloom_vnni.exe", - "quantize-starcoder_vnni.exe", - - "main-chatglm_vnni.exe", - "chatglm_C.cp39-win_amd64.pyd", - "chatglm_C.cp310-win_amd64.pyd", - "chatglm_C.cp311-win_amd64.pyd" + "quantize-starcoder_vnni.exe" ] linux_binarys = [ "libllama_avx.so", @@ -125,13 +120,7 @@ "main-llama", "main-gptneox", "main-bloom", - "main-starcoder", - - "main-chatglm_vnni", - "main-chatglm_amx", - "chatglm_C.cpython-39-x86_64-linux-gnu.so", - "chatglm_C.cpython-310-x86_64-linux-gnu.so", - "chatglm_C.cpython-311-x86_64-linux-gnu.so" +
"main-starcoder" ] ext_lib_urls = [ diff --git a/python/llm/src/ipex_llm/ggml/convert.py b/python/llm/src/ipex_llm/ggml/convert.py index cb4f6efbe72..dff8a43a205 100644 --- a/python/llm/src/ipex_llm/ggml/convert.py +++ b/python/llm/src/ipex_llm/ggml/convert.py @@ -76,10 +76,6 @@ def _convert_starcoder(model_path, outfile_dir, outtype): _convert_starcoder_hf_to_ggml(model_path, outfile_dir, outtype) -def _convert_chatglm(model_path, outfile_dir, outtype): - return _convert_chatglm_hf_to_ggml(model_path, outfile_dir, outtype) - - def _convert_to_ggml(model_path: str, outfile_dir: str, model_family: str = 'llama', outtype: str="fp16"): """ diff --git a/python/llm/src/ipex_llm/ggml/convert_model.py b/python/llm/src/ipex_llm/ggml/convert_model.py index 074a7d3537d..98c998add51 100644 --- a/python/llm/src/ipex_llm/ggml/convert_model.py +++ b/python/llm/src/ipex_llm/ggml/convert_model.py @@ -16,7 +16,7 @@ import os import time from pathlib import Path -from ipex_llm.ggml.convert import _convert_to_ggml, _convert_chatglm +from ipex_llm.ggml.convert import _convert_to_ggml from ipex_llm.ggml.quantize import quantize from ipex_llm.utils.common import invalidInputError import argparse @@ -54,9 +54,9 @@ def convert_model(input_path: str, # make sure directory exists os.makedirs(output_path, exist_ok=True) # check input value - invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder', 'chatglm'], + invalidInputError(model_family in ['llama', 'bloom', 'gptneox', 'starcoder'], "Now we only support quantization of model \ - family('llama', 'bloom', 'gptneox', 'starcoder', 'chatglm')", + family('llama', 'bloom', 'gptneox', 'starcoder')", "{} is not in the list.".format(model_family)) invalidInputError(os.path.isdir(output_path), "The output_path {} was not a directory".format(output_path)) @@ -78,12 +78,6 @@ def convert_model(input_path: str, family('llama', 'gptneox', 'starcoder')", "{} is not in the list.".format(model_family)) - # chatglm merges convertion and quantization into one operation. - if model_family == 'chatglm': - return _convert_chatglm(model_path=input_path, - outfile_dir=output_path, - outtype=dtype) - if tmp_path is not None: model_name = Path(input_path).stem tmp_ggml_file_path = os.path.join(tmp_path, f'{model_name}_{int(time.time())}') diff --git a/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py b/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py deleted file mode 100644 index ac7933c5384..00000000000 --- a/python/llm/src/ipex_llm/ggml/model/chatglm/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. 
- -from .chatglm import ChatGLM diff --git a/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py deleted file mode 100644 index cd1efba6154..00000000000 --- a/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm.py +++ /dev/null @@ -1,428 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# =========================================================================== -# -# This file is adapted from -# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py -# -# MIT License -# -# Copyright (c) 2023 Andrei Betlen -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. - - -from .chatglm_cpp import chatglm_load, chatglm_tokenize, chatglm_detokenize, \ - chatglm_forward, chatglm_eos_token -from ipex_llm.utils.common import invalidInputError -from ipex_llm.ggml.model.generation import GenerationMixin -from typing import List, Optional, Generator, Sequence, Union -import time -import uuid -import warnings - - -class ChatGLM(GenerationMixin): - """High-level Python wrapper for a chatglm.cpp model.""" - - def __init__( - self, - model_path: str, - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = -1, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = False, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = -1, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - verbose: bool = True, - ): - """Load a chatglm.cpp model from `model_path`. - - Args: - model_path: Path to the model. - n_ctx: Maximum context size. 
- n_parts: Number of parts to split the model into. If -1, the number of parts - is automatically determined. - seed: Random seed. For default value -1, current timestamp is used as seed. - f16_kv: Use half-precision for key/value cache. - logits_all: Return logits for all tokens, not just the last token. - vocab_only: Only load the vocabulary no weights. - use_mmap: Use mmap if possible. - use_mlock: Force the system to keep the model in RAM. - embedding: Embedding mode only. - n_threads: Number of threads to use. Default to be -1, means auto. - n_batch: Maximum number of prompt tokens to batch together when calling chatglm_eval. - last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. - lora_base: Optional path to base model, useful if using a quantized base model and - you want to apply LoRA to an f16 model. - lora_path: Path to a LoRA file to apply to the model. - verbose: Print verbose output to stderr. - - Raises: - ValueError: If the model path does not exist. - - Returns: - A ChatGLM instance. - """ - - self.model_path = model_path - self.ctx = chatglm_load(model_path, use_mmap=use_mmap, n_ctx=n_ctx, n_threads=n_threads) - self.n_ctx = n_ctx - self.n_parts = n_parts - self.n_gpu_layers = n_gpu_layers - self.f16_kv = f16_kv - self.seed = seed - self.logits_all = logits_all - self.vocab_only = vocab_only - self.use_mmap = use_mmap - self.use_mlock = use_mlock - self.embedding = embedding - self.n_threads = n_threads - self.n_batch = n_batch - self.last_n_tokens_size = last_n_tokens_size - self.lora_base = lora_base - self.lora_path = lora_path - self.verbose = verbose - # TODO: Some parameters are temporarily not supported - unsupported_arg = {'n_parts': -1, 'n_gpu_layers': 0, 'f16_kv': True, 'logits_all': False, - 'vocab_only': False, 'use_mlock': False, 'embedding': False, - 'n_batch': 512, 'last_n_tokens_size': 64, 'lora_base': None, - 'lora_path': None, 'verbose': True} - for arg in unsupported_arg.keys(): - if getattr(self, arg) != unsupported_arg[arg]: - warnings.warn(f"The parameter {arg} is temporarily unsupported, " - "please use the default value.") - - def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.95, - top_p: float = 0.7, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]]=[], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 0, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - ): - # TODO: Some parameters are temporarily not supported - # Unsupported parameters are checked in `_supported_call` - return self._supported_call(prompt, max_tokens, stream, temperature, top_p, top_k, - stop, model, suffix, logprobs, echo, frequency_penalty, - presence_penalty, repeat_penalty, tfs_z, mirostat_mode, - mirostat_tau, mirostat_eta) - - def _supported_call(self, prompt: str, max_tokens: int, stream: bool, - temperature: float, top_p: float, top_k: int, - stop: Optional[List[str]] = [], model: Optional[str] = None, *args): - # Check unsupporeted parameters - unsupported_arg = ['suffix', 'logprobs', 'echo', - 'frequency_penalty', 'presence_penalty', 'repeat_penalty', - 'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'model'] - defult_value = {'suffix': None, 'logprobs': None, 'echo': False, - 'frequency_penalty': 0.0, 'presence_penalty': 0.0, - 'repeat_penalty': 
1.1, 'tfs_z': 1.0, 'mirostat_mode': 0, - 'mirostat_tau': 5.0, 'mirostat_eta': 0.1} - for index in range(len(args)): - if args[index] != defult_value[unsupported_arg[index]]: - warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily " - "unsupported, please use the default value.") - - if stream: - return self.stream(prompt, max_tokens, temperature, top_p, top_k, stop, model) - else: - return self._eval(prompt, max_tokens, temperature, top_p, top_k, stop, model) - - def _eval(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int, - stop: Optional[List[str]] = [], model: Optional[str] = None): - - completion_id: str = f"cmpl-{str(uuid.uuid4())}" - created: int = int(time.time()) - if model is None: - model_name = self.model_path - else: - model_name = model - - input_tokens = self._tokenize(prompt) - prompt_len = len(input_tokens) - if max_tokens < 1: - return { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": prompt, - "index": 0, - "logprobs": None, - "finish_reason": "length", - } - ], - "usage": - { - "prompt_tokens": prompt_len, - "completion_tokens": 0, - "total_tokens": prompt_len, - } - } - - for i in range(max_tokens): - token = self.forward(input_ids=input_tokens, - top_k=top_k, - top_p=top_p, - temperature=temperature) - input_tokens.append(token) - if token == self.eos_token(): - break - - text = self.detokenize(input_tokens) - split_text = text[len(prompt):] - split_text.rstrip('�') # remove partial emoji - if stop != []: - for stop_word in stop: - split_text = split_text.split(stop_word)[0] - if split_text != text: - finish_reason = "stop" - else: - finish_reason = None - completion_len = len(input_tokens) - prompt_len - return { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": prompt + split_text, - "index": 0, - "logprobs": None, - "finish_reason": finish_reason, - } - ], - "usage": { - "prompt_tokens": prompt_len, - "completion_tokens": completion_len, - "total_tokens": prompt_len + completion_len, - } - } - - def stream(self, prompt: str, max_tokens: int, temperature: float, top_p: float, top_k: int, - stop: Optional[List[str]] = [], model: Optional[str] = None): - completion_id: str = f"cmpl-{str(uuid.uuid4())}" - created: int = int(time.time()) - if model is None: - model_name = self.model_path - else: - model_name = model - input_tokens = self._tokenize(prompt) - prompt_len = len(input_tokens) - if max_tokens < 1: - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": prompt, - "index": 0, - "logprobs": None, - "finish_reason": "length", - } - ], - "usage": { - "prompt_tokens": prompt_len - } - } - else: - history_text = prompt - for i in range(max_tokens): - token = self.forward(input_ids=input_tokens, - top_k=top_k, - top_p=top_p, - temperature=temperature) - input_tokens.append(token) - if token == self.eos_token(): - print('\n') - break - text = self.detokenize(input_tokens) - if text.endswith('�'): - # generated new token is part of an emoji - # (some emoji consists of multiple tokens) - # continue to generate more tokens to decode this emoji - continue - text = text[len(history_text):] - history_text += text - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": text, - "index": 0, - "logprobs": None, - 
"finish_reason": None, - } - ], - "usage": { - "prompt_tokens": prompt_len - } - } - - def _tokenize(self, text: str, *args) -> List[int]: - """Tokenize a string. - - Args: - text: The string to tokenize. - - Raises: - RuntimeError: If the tokenization failed. - - Returns: - A list of tokens. - """ - warnings.warn("The parameter `add_bos` is unsupported, please use the default value.") - return chatglm_tokenize(self.ctx, text) - - def detokenize(self, tokens: List[int]) -> str: - """Detokenize a list of tokens. - - Args: - tokens: The list of tokens to detokenize. - - Returns: - The detokenized string. - """ - if isinstance(tokens, int): - tokens = [tokens] - return chatglm_detokenize(self.ctx, tokens) - - def forward(self, - input_ids: List[int], - do_sample: bool = True, - top_k: int = 0, - top_p: float = 0.7, - temperature: float = 0.95,) -> int: - return chatglm_forward(ctx=self.ctx, - input_ids=input_ids, - do_sample=do_sample, - top_k=top_k, - top_p=top_p, - temperature=temperature) - - def eos_token(self) -> int: - return chatglm_eos_token(self.ctx) - - def _generate( - self, - tokens: Sequence[int], - top_k: int = 0, - top_p: float = 0.7, - temp: float = 0.95, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - ) -> Generator[int, Optional[Sequence[int]], None]: - """Create a generator of tokens from a prompt. - - Examples: - >>> llm = ChatGLM(your_model_path) - >>> tokens = llm._tokenize(b"Learning English is") - >>> for token in llm._generate(tokens): - >>> print(llm.detokenize([token]).decode("utf-8", errors="ignore")) - - Args: - tokens: The prompt tokens. - - Yields: - The generated tokens. - """ - # TODO: Some parameters are temporarily not supported - # Unsupported parameters are checked in `_supported_generate` - return self._supported_generate(tokens, top_k, top_p, temp, repeat_penalty, reset, - frequency_penalty, presence_penalty, tfs_z, mirostat_mode, - mirostat_tau, mirostat_eta) - - def _supported_generate(self, tokens: Sequence[int], top_k: int = 0, top_p: float = 0.7, - temp: float = 0.95, *args): - # Check unsupporeted parameters - unsupported_arg = ['repeat_penalty', 'reset', 'frequency_penalty', 'presence_penalty', - 'tfs_z', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta'] - defult_value = {'repeat_penalty': 1.1, 'reset': True, 'frequency_penalty': 0.0, - 'presence_penalty': 0.0, 'tfs_z': 1.0, 'mirostat_mode': 0, - 'mirostat_tau': 5.0, 'mirostat_eta': 0.1} - for index in range(len(args)): - if args[index] != defult_value[unsupported_arg[index]]: - warnings.warn(f"The parameter {unsupported_arg[index]} is temporarily " - "unsupported, please use the default value.") - - invalidInputError(self.ctx is not None, "The attribute `ctx` of `ChatGLM` object is None.") - while True: - token = self.forward(input_ids=tokens, - top_k=top_k, - top_p=top_p, - temperature=temp) - tokens_or_none = yield token - tokens.append(token) - if tokens_or_none is not None: - tokens.extend(tokens_or_none) diff --git a/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py b/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py deleted file mode 100644 index ce136cdb682..00000000000 --- a/python/llm/src/ipex_llm/ggml/model/chatglm/chatglm_cpp.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This would makes sure Python is aware there is more than one sub-package within bigdl, -# physically located elsewhere. -# Otherwise there would be module not found error in non-pip's setting as Python would -# only search the first bigdl package and end up finding only one sub-package. - - -from typing import List -from pathlib import Path - -from ipex_llm.libs.chatglm_C import Pipeline, GenerationConfig - - -class ChatGLMContext: - def __init__(self, pipeline: Pipeline, config: GenerationConfig): - self.pipeline = pipeline - self.config = config - - -def chatglm_load(path: str, - n_ctx: int, - n_threads: int, - use_mmap: bool = False, - ) -> ChatGLMContext: - path = str(Path(path)) - pipeline = Pipeline(path, use_mmap) - config = GenerationConfig( - max_length=n_ctx, - num_threads=n_threads, - ) - return ChatGLMContext(pipeline, config) - - -def chatglm_tokenize(ctx: ChatGLMContext, prompt: str) -> List[int]: - return ctx.pipeline.tokenizer.encode(prompt) - - -def chatglm_detokenize(ctx: ChatGLMContext, input_ids: List[int]) -> str: - return ctx.pipeline.tokenizer.decode(input_ids) - - -def chatglm_forward(ctx: ChatGLMContext, - input_ids: List[int], - do_sample: bool = True, - top_k: int = 0, - top_p: float = 0.7, - temperature: float = 0.95, - ) -> int: - ctx.config.do_sample = do_sample - ctx.config.top_k = top_k - ctx.config.top_p = top_p - ctx.config.temperature = temperature - return ctx.pipeline.forward(input_ids, ctx.config) - - -def chatglm_eos_token(ctx: ChatGLMContext): - return ctx.pipeline.model.config.eos_token_id diff --git a/python/llm/src/ipex_llm/ggml/quantize.py b/python/llm/src/ipex_llm/ggml/quantize.py index 3eaf668e58e..8388fc3bede 100644 --- a/python/llm/src/ipex_llm/ggml/quantize.py +++ b/python/llm/src/ipex_llm/ggml/quantize.py @@ -48,7 +48,9 @@ "q6_k": 26, "q4_k": 27, "q5_k": 28, - "fp6": 29} + "fp6": 29, + "fp6_k": 30, + } # mixed precison from llama.cpp gguf_mixed_qtype = {"gguf_q4k_s": 101, diff --git a/python/llm/src/ipex_llm/langchain/llms/__init__.py b/python/llm/src/ipex_llm/langchain/llms/__init__.py index b4c2e7b005b..00b5ecb225e 100644 --- a/python/llm/src/ipex_llm/langchain/llms/__init__.py +++ b/python/llm/src/ipex_llm/langchain/llms/__init__.py @@ -32,7 +32,6 @@ "LlamaLLM", "BloomLLM", "GptneoxLLM", - "ChatGLMLLM", "StarcoderLLM", "TransformersLLM", "TransformersPipelineLLM" @@ -43,7 +42,6 @@ "LlamaLLM": LlamaLLM, "BloomLLM": BloomLLM, "GptneoxLLM": GptneoxLLM, - "ChatGLMLLM": ChatGLMLLM, "StarcoderLLM": StarcoderLLM, "TransformersPipelineLLM": TransformersPipelineLLM, "TransformersLLM": TransformersLLM diff --git a/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py index afd51780579..3e7fc3eadf1 100644 --- a/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py +++ b/python/llm/src/ipex_llm/langchain/llms/bigdlllm.py @@ -70,14 +70,13 @@ class BigdlNativeLLM(LLM): "please switch to the new LLM API for sepcific models.") model_family: str = "llama" - 
"""The model family: currently supports llama, gptneox, bloom, starcoder and chatglm.""" + """The model family: currently supports llama, gptneox, bloom, starcoder.""" family_info = { 'llama': {'module': "ipex_llm.models" , 'class': "Llama"}, 'bloom': {'module': "ipex_llm.models", 'class': "Bloom"}, 'gptneox': {'module': "ipex_llm.models", 'class': "Gptneox"}, 'starcoder': {'module':"ipex_llm.models", 'class': "Starcoder"}, - 'chatglm': {'module':"ipex_llm.ggml.model.chatglm", 'class': "ChatGLM"}, } #: :meta private: """Info necessary for different model families initiation and configure.""" @@ -688,11 +687,6 @@ class GptneoxLLM(_BaseCausalLM): ggml_module = "ipex_llm.models" -class ChatGLMLLM(_BaseCausalLM): - ggml_model = "ChatGLM" - ggml_module = "ipex_llm.ggml.model.chatglm" - - class StarcoderLLM(_BaseCausalLM): ggml_model = "Starcoder" ggml_module = "ipex_llm.models" diff --git a/python/llm/src/ipex_llm/models.py b/python/llm/src/ipex_llm/models.py index 9157af42b7f..d982307d60e 100644 --- a/python/llm/src/ipex_llm/models.py +++ b/python/llm/src/ipex_llm/models.py @@ -23,5 +23,3 @@ from ipex_llm.ggml.model.gptneox import Gptneox from ipex_llm.ggml.model.bloom import Bloom from ipex_llm.ggml.model.starcoder import Starcoder -# temporarily disable until linux binary file for chatglm ready -# from ipex_llm.ggml.model.chatglm import ChatGLM diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 04efa8ae4a2..29e7c7cd0e2 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -54,6 +54,7 @@ _IS_VLLM_AVAILABLE = None _USE_VLLM = False +_VLLM_VERSION = None def is_auto_gptq_available(): @@ -77,6 +78,14 @@ def is_vllm_available(): return _IS_VLLM_AVAILABLE +def get_package_version(package_name): + result = subprocess.run(['pip', 'list'], capture_output=True, text=True) + for line in result.stdout.splitlines(): + if line.startswith(package_name): + return line.split()[1] + return None + + def get_use_vllm(): return _USE_VLLM @@ -133,13 +142,24 @@ def is_linear_module(module): is_awq = is_auto_awq_available() and isinstance(module, WQLinear_GEMM) if is_vllm_available(): # Only convert vllm modules + global _VLLM_VERSION + if _VLLM_VERSION is None: + _VLLM_VERSION = get_package_version('vllm') + if 'xpu' in _VLLM_VERSION: + # For vllm xpu + from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_world_size + ) + tp_size = get_tensor_model_parallel_world_size() + else: + # For vllm cpu + tp_size = 1 + from vllm.model_executor.layers.linear import ( ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear ) - from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_group, - get_tensor_model_parallel_world_size - ) + VLLM_LINEAR_LIST = [ ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear ] @@ -148,7 +168,6 @@ def is_linear_module(module): out_features = module.output_size result = True mp_group = None - tp_size = get_tensor_model_parallel_world_size() if isinstance(module, RowParallelLinear) and tp_size >= 2: mp_group = get_tensor_model_parallel_group() in_features = module.input_size_per_partition @@ -661,41 +680,15 @@ def _optimize_pre(model): if model.lm_head.weight.data.device != "meta": norm_weight = nn.functional.normalize(lm_head_weight_data) model.lm_head.weight.data = norm_weight + + # for 
baichuan2-7B + if model.config.hidden_size in [4096, 2048]: + from ipex_llm.transformers.models.baichuan import pre_compute_inv_freq + model.apply(pre_compute_inv_freq) # for yuan 2.0 if model.config.model_type == "yuan": - def merge_qk_proj_func(module): - if "YuanAttention" in module.__class__.__name__: - q_weight = module.q_proj.weight.data - k_weight = module.k_proj.weight.data - num_heads = module.num_heads - head_dim = module.head_dim - hidden_size = module.hidden_size - - weight_q = torch.cat([ - q_weight.view(num_heads, head_dim, hidden_size)[0::2, :, :], - k_weight.view(num_heads, head_dim, hidden_size)[0::2, :, :], - ], dim=0).view(num_heads * head_dim, hidden_size) - - weight_k = torch.cat([ - q_weight.view(num_heads, head_dim, hidden_size)[1::2, :, :], - k_weight.view(num_heads, head_dim, hidden_size)[1::2, :, :], - ], dim=0).view(num_heads * head_dim, hidden_size) - - merged_q_proj = torch.nn.Linear(0, 0, False) - merged_q_proj.weight = torch.nn.Parameter(weight_q, requires_grad=False) - merged_q_proj.in_features = hidden_size - merged_q_proj.out_features = num_heads * head_dim - module.merged_q_proj = merged_q_proj - - merged_k_proj = torch.nn.Linear(0, 0, False) - merged_k_proj.weight = torch.nn.Parameter(weight_k, requires_grad=False) - merged_k_proj.in_features = hidden_size - merged_k_proj.out_features = num_heads * head_dim - module.merged_k_proj = merged_k_proj - - del module.q_proj - del module.k_proj - model.apply(merge_qk_proj_func) + from ipex_llm.transformers.models.yuan import merge_qk + model.apply(merge_qk) # for bge-large if model.config.model_type == 'bert' and ( not model.config.is_decoder and @@ -715,16 +708,13 @@ def merge_qk_proj_func(module): model.apply(pre_compute_inv_freq) from ipex_llm.transformers.models.phi3 import split_mlp model.apply(split_mlp) - # for baichuan2 - if model.config.model_type == "baichuan" and model.config.vocab_size == 125696: - if model.config.hidden_size in [4096, 2048]: - # baichuan2-7B - from ipex_llm.transformers.models.baichuan2 import pre_compute_inv_freq - model.apply(pre_compute_inv_freq) # for qwen2 if model.config.model_type == "qwen2": from ipex_llm.transformers.models.qwen2 import merge_qkv model.apply(merge_qkv) + if model.config.model_type == "qwen2_moe": + from ipex_llm.transformers.models.qwen2_moe import merge_qkv + model.apply(merge_qkv) if model.config.model_type == "stablelm": # For stablelm-zephyr-3b and stablelm-2-zephyr-1_6b from ipex_llm.transformers.models.stablelm import merge_qkv @@ -803,7 +793,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, if optimize_model: model = _optimize_post(model, lightweight_bmm) - if hasattr(model, "config") and \ + if hasattr(model, "config") and hasattr(model.config, "model_type") and \ model.config.model_type == "qwen" and hasattr(model.config, "visual"): # for Qwen-VL-Chat # Due to issue https://github.com/intel/intel-extension-for-pytorch/issues/454, @@ -836,7 +826,7 @@ def convert_bigdl_other_module(model, dtype): def convert_forward(m, target_m, new_forward): for _, sub_m in m.named_children(): - if isinstance(sub_m, target_m): + if sub_m.__class__ == target_m: bound_method = new_forward.__get__(sub_m, sub_m.__class__) setattr(sub_m, "forward", bound_method) convert_forward(sub_m, target_m, new_forward) @@ -853,7 +843,7 @@ def replace_RotaryEmbed(m, target_m, replace_embed): def replace_func(m, target_m, func_name, new_func): for _, sub_m in m.named_children(): - if isinstance(sub_m, target_m): + if sub_m.__class__ == target_m: bound_method = 
new_func.__get__(sub_m, sub_m.__class__) setattr(sub_m, func_name, bound_method) replace_func(sub_m, target_m, func_name, new_func) @@ -1045,6 +1035,24 @@ def _optimize_post(model, lightweight_bmm=False): module.SelfAttention, chatglm_attention_forward ) + elif (model.config.num_layers == 40 and hasattr(model.config, 'rope_ratio') + and model.config.rope_ratio == 500): + # glm-4-9b-chat + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + from ipex_llm.transformers.models.chatglm4 import chatglm4_attention_forward + from ipex_llm.transformers.models.chatglm4 import chatglm4_model_forward + from ipex_llm.transformers.models.chatglm2 import chatglm_rms_norm_forward + convert_forward(model, + module.SelfAttention, + chatglm4_attention_forward) + convert_forward(model, + module.ChatGLMModel, + chatglm4_model_forward) + convert_forward(model, + module.RMSNorm, + chatglm_rms_norm_forward) + elif "mpt" in model.config.model_type: if model.config.architectures is not None: modeling_module_name = model.__class__.__module__ @@ -1116,84 +1124,39 @@ def _optimize_post(model, lightweight_bmm=False): module.FalconAttention, falcon_attention_forward ) + elif model.config.model_type == "baichuan": + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + from ipex_llm.transformers.models.baichuan import baichuan_mlp_forward + convert_forward(model, module.MLP, baichuan_mlp_forward) - elif model.config.model_type == "baichuan" and model.config.vocab_size == 125696: - # baichuan2 if model.config.hidden_size in [4096, 2048]: - # baichuan2-7B - modeling_module_name = model.__class__.__module__ - module = importlib.import_module(modeling_module_name) - from ipex_llm.transformers.models.baichuan2 import baichuan_attention_forward_7b - from ipex_llm.transformers.models.baichuan2 import baichuan_mlp_forward - convert_forward(model, - module.Attention, - baichuan_attention_forward_7b - ) - convert_forward(model, - module.RMSNorm, - llama_rms_norm_forward) - convert_forward(model, - module.MLP, - baichuan_mlp_forward) - elif model.config.hidden_size == 5120: - # baichuan2-13B - modeling_module_name = model.__class__.__module__ - module = importlib.import_module(modeling_module_name) - from ipex_llm.transformers.models.baichuan2 import baichuan_attention_forward_13b - from ipex_llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward - from ipex_llm.transformers.models.baichuan2 import baichuan_mlp_forward - from ipex_llm.transformers.models.baichuan2 import baichuan_13b_get_alibi_mask - convert_forward(model, - module.BaichuanAttention, - baichuan_attention_forward_13b - ) - # baichuan2-13B's RMSNorm is a little different - convert_forward(model, - module.RMSNorm, - baichuan_13b_rms_norm_forward) - convert_forward(model, - module.MLP, - baichuan_mlp_forward) - if hasattr(model.model, 'get_alibi_mask_orig'): - # deepspeed rewrite "get_alibi_mask" to support baichuan - # https://github.com/microsoft/DeepSpeed/pull/4721 - replace_func(model, - module.BaichuanModel, - "get_alibi_mask_orig", - baichuan_13b_get_alibi_mask) - else: - replace_func(model, - module.BaichuanModel, - "get_alibi_mask", - baichuan_13b_get_alibi_mask) - elif model.config.model_type == "baichuan": - # baichuan1 - if model.config.hidden_size == 4096: - # baichuan-7B - modeling_module_name = model.__class__.__module__ - module = importlib.import_module(modeling_module_name) + # baichuan-7B and baichuan2-7B from 
ipex_llm.transformers.models.baichuan import baichuan_attention_forward_7b - convert_forward(model, - module.Attention, - baichuan_attention_forward_7b - ) - convert_forward(model, - module.RMSNorm, - llama_rms_norm_forward) + convert_forward(model, module.Attention, baichuan_attention_forward_7b) + convert_forward(model, module.RMSNorm, llama_rms_norm_forward) elif model.config.hidden_size == 5120: - # baichuan-13B - modeling_module_name = model.__class__.__module__ - module = importlib.import_module(modeling_module_name) + # baichuan-13B and baichuan2-13B from ipex_llm.transformers.models.baichuan import baichuan_attention_forward_13b - from ipex_llm.transformers.models.baichuan2 import baichuan_13b_rms_norm_forward - convert_forward(model, - module.BaichuanAttention, - baichuan_attention_forward_13b - ) - # baichuan-13B's RMSNorm is a little different - convert_forward(model, - module.RMSNorm, - baichuan_13b_rms_norm_forward) + from ipex_llm.transformers.models.baichuan import baichuan_13b_rms_norm_forward + convert_forward(model, module.BaichuanAttention, baichuan_attention_forward_13b) + convert_forward(model, module.RMSNorm, baichuan_13b_rms_norm_forward) + + if model.config.vocab_size == 125696: + # baichaun2-13B + from ipex_llm.transformers.models.baichuan import baichuan_13b_get_alibi_mask + if hasattr(model.model, 'get_alibi_mask_orig'): + # deepspeed rewrite "get_alibi_mask" to support baichuan + # https://github.com/microsoft/DeepSpeed/pull/4721 + replace_func(model, + module.BaichuanModel, + "get_alibi_mask_orig", + baichuan_13b_get_alibi_mask) + else: + replace_func(model, + module.BaichuanModel, + "get_alibi_mask", + baichuan_13b_get_alibi_mask) elif model.config.model_type == "gpt_neox": from ipex_llm.transformers.models.gptneox import gptneox_attention_forward convert_forward(model, @@ -1291,13 +1254,16 @@ def _optimize_post(model, lightweight_bmm=False): convert_forward(model, module.Qwen2Attention, qwen2_attention_forward) + convert_forward(model, + module.Qwen2SdpaAttention, + qwen2_attention_forward) elif model.config.model_type == "qwen2_moe": # for Qwen1.5-MOE-A2.7B modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from ipex_llm.transformers.models.qwen2_moe import qwen2moe_moeblock_forward - from ipex_llm.transformers.models.qwen2_moe import qwen2moe_attention_forward from ipex_llm.transformers.models.qwen2_moe import qwen2moe_model_forward + from ipex_llm.transformers.models.qwen2 import qwen2_attention_forward convert_forward(model, module.Qwen2MoeModel, qwen2moe_model_forward) @@ -1312,7 +1278,10 @@ def _optimize_post(model, lightweight_bmm=False): llama_mlp_forward) convert_forward(model, module.Qwen2MoeAttention, - qwen2moe_attention_forward) + qwen2_attention_forward) + convert_forward(model, + module.Qwen2MoeSdpaAttention, + qwen2_attention_forward) elif model.config.model_type == "cohere": # for CohereForAI/c4ai-command-r-v01 modeling_module_name = model.__class__.__module__ @@ -1510,7 +1479,8 @@ def safe_bmm_fwd(*args, **kwargs): modeling_module_name = model.__class__.__module__ module = importlib.import_module(modeling_module_name) from ipex_llm.transformers.models.gptbigcode import _attn_wrapper - from ipex_llm.transformers.models.gptbigcode import gptbigcode_attention_forward + from ipex_llm.transformers.models.gptbigcode import gptbigcode_attention_forward, \ + gptbigcode_sdpa_attention_forward convert_forward(model, module.GPTBigCodeAttention, gptbigcode_attention_forward) @@ -1519,6 +1489,18 @@ 
def safe_bmm_fwd(*args, **kwargs): module.GPTBigCodeAttention, "_attn", _attn) + try: + # for transformers 4.36+ + convert_forward(model, + module.GPTBigCodeSdpaAttention, + gptbigcode_sdpa_attention_forward) + sdpa_attn = _attn_wrapper(module.GPTBigCodeSdpaAttention._attn) + replace_func(model, + module.GPTBigCodeSdpaAttention, + "_attn", + sdpa_attn) + except AttributeError: + pass elif model.config.model_type == "starcoder2": # starcoder2 modeling_module_name = model.__class__.__module__ @@ -1598,4 +1580,22 @@ def safe_bmm_fwd(*args, **kwargs): module.StableLmModel, stablelm_model_forward ) + elif model.config.model_type == 'minicpm': + from ipex_llm.transformers.models.minicpm import minicpm_attention_forward + from ipex_llm.transformers.models.minicpm import minicpm_model_forward + modeling_module_name = model.__class__.__module__ + module = importlib.import_module(modeling_module_name) + convert_forward(model, + module.MiniCPMMLP, + llama_mlp_forward) + convert_forward(model, + module.MiniCPMRMSNorm, + llama_rms_norm_forward) + convert_forward(model, + module.MiniCPMAttention, + minicpm_attention_forward) + convert_forward(model, + module.MiniCPMModel, + minicpm_model_forward) + return model diff --git a/python/llm/src/ipex_llm/transformers/lookup.py b/python/llm/src/ipex_llm/transformers/lookup.py index 1eaaf83a5ea..36815902445 100644 --- a/python/llm/src/ipex_llm/transformers/lookup.py +++ b/python/llm/src/ipex_llm/transformers/lookup.py @@ -99,6 +99,10 @@ def generate( GenerationMixin.generate = generate +def tensor2key(key_tensor: torch.LongTensor): + return tuple(key_tensor.tolist()) + + # This class is copied from https://github.com/huggingface/transformers/blob/main/src # /transformers/generation/candidate_generator.py class PromptLookupCandidateGenerator(): @@ -133,9 +137,34 @@ def __init__( self.max_candidates = 9 self.min_candidates = 0 + self.lookup_table = {} invalidInputError(self.max_matching_ngram_size > 0 and self.num_output_tokens > 0, "Invalid max_matching_ngram_size or num_output_tokens") + def init_look_up_table(self, + input_ids: torch.LongTensor): + for ngram_size in range(self.max_matching_ngram_size, 0, -1): + # Create sliding windows of size ngram_size + windows = input_ids.unfold(dimension=1, size=ngram_size, step=1) + for idx in range(windows.size(1)): + window = tensor2key(windows[0, idx]) + if window not in self.lookup_table: + self.lookup_table[window] = idx + + def update_look_up_table(self, + new_input_ids: torch.LongTensor): + # Maintain a look up table + window = tensor2key(new_input_ids[0, -self.max_matching_ngram_size:]) + for ngram_size in range(self.max_matching_ngram_size): + if window[ngram_size:] not in self.lookup_table: + self.lookup_table[window[ngram_size:]] = \ + new_input_ids.size(1)-self.max_matching_ngram_size+ngram_size + + def get_n_gram_idx(self, + ngram_tensor: torch.LongTensor): + key = tensor2key(ngram_tensor) + return self.lookup_table[key] + def get_candidates(self, input_ids: torch.LongTensor)-> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: @@ -156,31 +185,20 @@ def get_candidates(self, input_length = input_ids.size(1) chosen_ids = None - match_found = False for ngram_size in range(min(self.max_matching_ngram_size, input_length - 1), 0, -1): - # Create sliding windows of size ngram_size - windows = input_ids.unfold(dimension=1, size=ngram_size, step=1) - # Convert ngram to a tensor for comparison ngram_tensor = input_ids[0, -ngram_size:] - # Find where the windows match the ngram - matches = (windows == 
ngram_tensor).all(dim=2) - - # Get the indices of matches - match_indices = matches.nonzero(as_tuple=True)[1] + # # Get the indices of matches + idx = self.get_n_gram_idx(ngram_tensor) # Iterate through match indices to find a valid continuation - for idx in match_indices: - start_idx = idx + ngram_size - end_idx = start_idx + self.num_output_tokens - end_idx = min(end_idx, input_length) - - if start_idx < end_idx: - chosen_ids = input_ids[0, start_idx:end_idx] - match_found = True - break - if match_found: + start_idx = idx + ngram_size + end_idx = start_idx + self.num_output_tokens + end_idx = min(end_idx, input_length) + + if start_idx < end_idx: + chosen_ids = input_ids[0, start_idx:end_idx] break if chosen_ids is None or len(chosen_ids) == 0: @@ -267,6 +285,9 @@ def lookup_generate(self, else: output_ids = greedy(logits) input_ids = torch.cat((input_ids, output_ids), dim=-1) + + candidates_generator.init_look_up_table(input_ids) + past_key_values = output['past_key_values'] step += 1 if self.device.type == 'xpu': @@ -319,9 +340,13 @@ def lookup_generate(self, # Drafts start from [1, k] # Verified output start from [0, k - 1] # including the one generated by the base model + n_matches = ((output_ids[:, :-1] != verify_input_ids[:, 1:]) .cumsum(-1) == 0).sum(-1).item() + max_matched = n_matches + 1 + mot = time.time() + self.match_time.append(mot-toc) max_of_max_matched = output_ids.size(1) # Accept number is max_matched, min is 1 @@ -343,9 +368,12 @@ def lookup_generate(self, accept_rate) input_ids = torch.cat((input_ids, output_ids), dim=-1) + candidates_generator.update_look_up_table(input_ids) step += output_ids.size(1) step_verify += 1 + pot = time.time() + self.post_time.append(pot-mot) # Stop on eos and remove content after eos output_ids_list = output_ids[0].tolist() diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 011ec7ed4af..b429c3ce08e 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -80,6 +80,7 @@ Q4_K = ggml_tensor_qtype["q4_k"] Q6_K = ggml_tensor_qtype["q6_k"] Q5_K = ggml_tensor_qtype["q5_k"] +FP6_K = ggml_tensor_qtype["fp6_k"] # For sym_int4 @@ -220,7 +221,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, if not convert_shape_only and device != 'meta': dst = ctypes.c_void_p(dst_tensor.data.data_ptr()) hist = (ctypes.c_int64 * 16)() - if qtype not in [IQ2_XXS, IQ2_XS, Q2_K, IQ1_S, Q4_K, Q6_K, Q5_K]: + if qtype not in [IQ2_XXS, IQ2_XS, Q2_K, IQ1_S, Q4_K, Q6_K, Q5_K, FP6_K]: ggml.ggml_quantize_tensor(src, dst, qtype, n, k, hist) else: if imatrix is not None: @@ -244,7 +245,8 @@ def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int src = ctypes.c_void_p(tensor.data.data_ptr()) - if qtype in [SYM_INT4, ASYM_INT4, SYM_INT8, NF4, NF3, FP4, FP6, FP8E4, FP8E5, Q4_K]: + if qtype in [SYM_INT4, ASYM_INT4, SYM_INT8, NF4, NF3, FP4, FP6, FP8E4, FP8E5, + Q4_K, Q6_K, FP6_K]: dst_tensor = torch.empty_like(tensor) elif qtype == ggml_tensor_qtype["sym_int5"]: QK = ggml.ggml_qk_size(qtype) @@ -269,7 +271,8 @@ def ggml_q_format_convet_xpu2cpu(tensor: torch.Tensor, num_elem: int, qtype: int src = ctypes.c_void_p(tensor.data.data_ptr()) - if qtype in [SYM_INT4, ASYM_INT4, SYM_INT8, NF4, NF3, FP4, FP6, FP8E4, FP8E5, Q4_K]: + if qtype in [SYM_INT4, ASYM_INT4, SYM_INT8, NF4, NF3, FP4, FP6, FP8E4, FP8E5, + Q4_K, Q6_K, FP6_K]: dst_tensor = torch.empty_like(tensor) elif qtype == 
ggml_tensor_qtype["sym_int5"]: QK = ggml.ggml_qk_size(ggml_tensor_qtype["asym_int5"]) diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py index 21a1b1ea357..70c2f0d9b2b 100644 --- a/python/llm/src/ipex_llm/transformers/model.py +++ b/python/llm/src/ipex_llm/transformers/model.py @@ -154,7 +154,7 @@ def from_pretrained(cls, ``'nf4'``, ``'fp4'``, ``'fp8'``, ``'fp8_e4m3'``, ``'fp8_e5m2'``, ``'fp6'``, ``'gguf_iq2_xxs'``, ``'gguf_iq2_xs'``, ``'gguf_iq1_s'``, ``'gguf_q4k_m'``, ``'gguf_q4k_s'``, - ``'fp16'``, ``'bf16'``, + ``'fp16'``, ``'bf16'``, ``'fp6_k'``, ``'sym_int4'`` means symmetric int 4, ``'asym_int4'`` means asymmetric int 4, ``'nf4'`` means 4-bit NormalFloat, etc. Relevant low bit optimizations will be applied to the model. @@ -428,7 +428,7 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): f"Unknown load_in_low_bit value: {q_k}, expected:" f" sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, " f"fp4, fp6, fp8, fp8_e4m3, fp8_e5m2, fp16, bf16, gguf_iq2_xxs, " - f"gguf_iq2_xs, gguf_iq1_s, q2_k, q4_k, q5_k, q6_k, " + f"gguf_iq2_xs, gguf_iq1_s, q2_k, q4_k, q5_k, q6_k, fp6_k" f"gguf_q4k_s, gguf_q4k_m, mixed_fp4 or mixed_fp8.") if q_k in ggml_tensor_qtype: qtype = ggml_tensor_qtype[q_k] diff --git a/python/llm/src/ipex_llm/transformers/modelling_bigdl.py b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py index e81b6fdc00d..d068e30ee68 100644 --- a/python/llm/src/ipex_llm/transformers/modelling_bigdl.py +++ b/python/llm/src/ipex_llm/transformers/modelling_bigdl.py @@ -42,8 +42,7 @@ def from_pretrained(cls, :param pretrained_model_name_or_path: Path for converted BigDL-LLM optimized ggml binary checkpoint. The checkpoint should be converted by ``ipex_llm.llm_convert``. :param model_family: The model family of the pretrained checkpoint. - Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"`` - and ``"chatglm"``. + Currently we support ``"llama"``, ``"bloom"``, ``"gptneox"``, ``"starcoder"``. :param dtype: Which quantized precision will be converted. Now only `int4` and `int8` are supported, and `int8` only works for `llama` , `gptneox` and `starcoder`. 
@@ -58,9 +57,9 @@ def from_pretrained(cls, """ logging.warning("BigdlNativeForCausalLM has been deprecated, " "please switch to the new CausalLM API for sepcific models.") - invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder', 'chatglm'], + invalidInputError(model_family in ['llama', 'gptneox', 'bloom', 'starcoder'], "Now we only support model family: 'llama', 'gptneox', 'bloom'," - " 'starcoder', 'chatglm', '{}' is not in the list.".format(model_family)) + " 'starcoder', '{}' is not in the list.".format(model_family)) invalidInputError(dtype.lower() in ['int4', 'int8'], "Now we only support int4 and int8 as date type for weight") @@ -78,9 +77,6 @@ def from_pretrained(cls, elif model_family == 'starcoder': from ipex_llm.ggml.model.starcoder import Starcoder return Starcoder(model_path=ggml_model_path, **kwargs) - elif model_family == 'chatglm': - from ipex_llm.ggml.model.chatglm import ChatGLM - return ChatGLM(model_path=ggml_model_path, **kwargs) class _BaseGGMLClass: @@ -110,9 +106,9 @@ def from_pretrained(cls, :return: a model instance """ try: - module = importlib.import_module(cls.GGML_Module) - class_ = getattr(module, cls.GGML_Model) if native: + module = importlib.import_module(cls.GGML_Module) + class_ = getattr(module, cls.GGML_Model) invalidInputError(dtype.lower() in ['int4', 'int8'], "Now we only support int4 and int8 as date type for weight") ggml_model_path = pretrained_model_name_or_path diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan.py b/python/llm/src/ipex_llm/transformers/models/baichuan.py index 8bcdb637435..c74e97543ef 100644 --- a/python/llm/src/ipex_llm/transformers/models/baichuan.py +++ b/python/llm/src/ipex_llm/transformers/models/baichuan.py @@ -14,184 +14,64 @@ # limitations under the License. 
# This file is adapted from -# https://huggingface.co/baichuan-inc/Baichuan-7B/blob/c1a5c7d5b7f50ecc51bb0e08150a9f12e5656756/modeling_baichuan.py +# https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/cb7fc748b78b7ea99772e4cf76db155729ce774e/modeling_baichuan.py # and -# https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/a4a558127068f2ce965aa56aeb826bf501a68970/modeling_baichuan.py - +# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py import math -from typing import List, Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.utils.checkpoint -from torch import nn -import torch.nn.functional as F -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from ipex_llm.utils.common import invalidInputError -from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp -from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, \ - append_kv_cache, is_enough_kv_cache_room_4_31 -from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ - restore_fp8_kv_cache, use_quantize_kv_cache -from ipex_llm.transformers.models.utils import rotate_half, apply_rotary_pos_emb -from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu - -import os - -KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) +from torch.nn import functional as F +from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache +from ipex_llm.transformers.models.utils import update_past_key_value +from ipex_llm.transformers.models.utils import should_use_fuse_rope +from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp, use_sdp_causal +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU +from ipex_llm.transformers.models.utils import mlp_fusion_check +import warnings -def baichuan_attention_forward_7b( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if use_quantize_kv_cache(self.W_pack, hidden_states): - forward_function = baichuan_attention_forward_7b_quantized - else: - forward_function = baichuan_attention_forward_7b_origin - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache - ) +def pre_compute_inv_freq(module: torch.nn.Module): + if module.__class__.__name__ == "RotaryEmbedding": + inv_freq = module.inv_freq + del module.inv_freq + module.register_buffer("inv_freq", inv_freq, persistent=False) -def baichuan_attention_forward_7b_quantized( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, 
-2).squeeze(-2) - # batch_size x source_len x hidden_size - query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # batch_size x target_len x head_size - key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # batch_size x source_len x hidden_size - value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if query_states.device.type == "xpu" and not (self.training and query_states.requires_grad): - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "baichuan") - else: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "baichuan") - # [bsz, nh, t, hd] - - if past_key_value is None: - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - invalidInputError( - False, - f"Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}" - f", but is {attn_weights.size()}" - ) - - if attention_mask is not None: - invalidInputError( - attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " - f"but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - kv_seq_len = key_states.shape[-2] - if use_cache: - k_cache, v_cache = init_fp8_kv_cache( - bsz, self.num_heads, kv_seq_len, self.head_dim, - device=device - ) - key_states, value_states = append_fp8_kv_cache(k_cache, v_cache, key_states, - value_states) - past_key_value = (key_states, value_states) - else: - k_cache, v_cache = past_key_value - key_states, value_states = append_fp8_kv_cache(k_cache, v_cache, - key_states, value_states) - kv_seq_len = key_states.shape[-2] - past_key_value = (key_states, value_states) - if query_states.size(2) != 1 or query_states.device.type != 'xpu': - key_states, value_states = restore_fp8_kv_cache(key_states, value_states, - query_states.dtype) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - attn_weights = attn_weights / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - invalidInputError( - False, - f"Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}" - f", but is {attn_weights.size()}" - ) - - if attention_mask is not None: - invalidInputError( - attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " - f"but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - else: - import xe_addons - attn_output = xe_addons.sdp_fp8(query_states, 
key_states, value_states, - attention_mask) - attn_weights = None - - invalidInputError( - attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - f"`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}," - f"but is {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) +def baichuan_13b_rms_norm_forward(self, hidden_states): + if hidden_states.device.type == "xpu" and not (self.training or hidden_states.requires_grad): + import xe_addons + x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous() + output = xe_addons.rms_norm(self.weight, x_2d, self.epsilon) + return output.reshape(hidden_states.shape) - attn_output = self.o_proj(attn_output) + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon) + return self.weight * hidden_states.to(input_dtype) - if not output_attentions: - attn_weights = None - return attn_output.to(hidden_states.dtype), attn_weights, past_key_value +def baichuan_mlp_forward( + self, + x: torch.Tensor, +) -> torch.Tensor: + x_2d = x.view(-1, x.shape[-1]) + qtype = getattr(self.gate_proj, "qtype", None) + if mlp_fusion_check(x_2d, qtype, self.training) and not self.down_proj.enable_xetla: + import xe_linear + if not x_2d.is_contiguous(): + x_2d = x_2d.contiguous() + return self.down_proj(xe_linear.mlp_forward_xpu( + x_2d, self.gate_proj.weight.data, self.up_proj.weight.data, + x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_len, + SILU, qtype + )) + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -def baichuan_attention_forward_7b_origin( +def baichuan_attention_forward_7b( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, @@ -199,118 +79,82 @@ def baichuan_attention_forward_7b_origin( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: +): bsz, q_len, _ = hidden_states.size() device = hidden_states.device - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) - # batch_size x source_len x hidden_size - query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # batch_size x target_len x head_size - key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # batch_size x source_len x hidden_size - value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - enough_kv_room = True + qkv = self.W_pack(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) + qkv = qkv.transpose(1, 2) + query_states, key_states, value_states = qkv.split([self.num_heads, + self.num_heads, + self.num_heads], dim=1) + + kv_seq_len = key_states.shape[2] if past_key_value is not None: - enough_kv_room = is_enough_kv_cache_room_4_31(past_key_value, seq_len=kv_seq_len) - kv_seq_len += past_key_value[0].shape[-2] - if query_states.device.type == "xpu" and not (self.training and query_states.requires_grad): - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "baichuan") + kv_seq_len += past_key_value[0].shape[2] + + # IPEX-LLM OPT: fuse 
rope + if should_use_fuse_rope(hidden_states, position_ids, self.training): + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids, "baichuan") - # [bsz, nh, t, hd] - - # if past_key_value is not None: - # # reuse k, v, self_attention - # key_states = torch.cat([past_key_value[0], key_states], dim=2) - # value_states = torch.cat([past_key_value[1], value_states], dim=2) - if past_key_value is not None: - # reuse k, v, self_attention - cache_k = past_key_value[0] - cache_v = past_key_value[1] - if not enough_kv_room: - # allocate new - new_cache_k, new_cache_v = extend_kv_cache(bsz, - self.num_heads, - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - new_cache_k[:] = cache_k - new_cache_v[:] = cache_v - cache_k = new_cache_k - cache_v = new_cache_v - - key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states) - - elif use_cache: - max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = init_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) - new_key_states[:] = key_states - new_value_states[:] = value_states - key_states = new_key_states - value_states = new_value_states - + query_states = query_states.to(hidden_states.dtype) + key_states = key_states.to(hidden_states.dtype) + + # IPEX-LLM OPT: kv cache and quantize kv + use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states) + key_states, value_states = update_past_key_value( + past_key_value, key_states, value_states, + kv_seq_len, use_quantize_kv, device + ) past_key_value = (key_states, value_states) if use_cache else None + if self.training: + warnings.warn("xops is not supported on Intel GPU, so just use normal implementation") + + # IPEX-LLM OPT: sdp + attn_weights = None if not self.training and not hidden_states.requires_grad and \ use_flash_attention(query_states, key_states, attention_mask): - attn_output = F.scaled_dot_product_attention(query_states.to(device, dtype=torch.float16), - key_states.to(device, dtype=torch.float16), - value_states.to(device, dtype=torch.float16), - is_causal=True) - attn_weights = None - elif not self.training and not hidden_states.requires_grad and \ - use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): + attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16), + key_states.to(dtype=torch.float16), + value_states.to(dtype=torch.float16), + is_causal=True).to(hidden_states.dtype) + elif use_sdp(q_len, kv_seq_len, self.head_dim, query_states): import xe_addons - attn_output = xe_addons.sdp(query_states, key_states, value_states, - attention_mask) - attn_output = attn_output.view(query_states.shape) - attn_weights = None + if use_quantize_kv: + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + attention_mask) + else: + attn_output = xe_addons.sdp(query_states, key_states, value_states, + attention_mask) + elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): + import xe_addons + if use_quantize_kv: + attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, + value_states, attention_mask) + else: + attn_output = xe_addons.sdp_causal(query_states, 
key_states, + value_states, attention_mask) else: + if use_quantize_kv: + key_states, value_states = restore_fp8_kv_cache(key_states, value_states, + query_states.dtype) attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - invalidInputError(False, - f"Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}" - f", but is {attn_weights.size()}") - if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " - f"but is {attention_mask.size()}") attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, + dtype=torch.float32).to(value_states.dtype) attn_output = torch.matmul(attn_weights, value_states) - invalidInputError(attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - f"`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}," - f"but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) @@ -318,7 +162,7 @@ def baichuan_attention_forward_7b_origin( if not output_attentions: attn_weights = None - return attn_output.to(hidden_states.dtype), attn_weights, past_key_value + return attn_output, attn_weights, past_key_value def baichuan_attention_forward_13b( @@ -329,101 +173,57 @@ def baichuan_attention_forward_13b( output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if use_quantize_kv_cache(self.W_pack, hidden_states): - forward_function = baichuan_attention_forward_13b_quantized - else: - forward_function = baichuan_attention_forward_13b_origin - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache - ) - - -def baichuan_attention_forward_13b_quantized( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() device = hidden_states.device - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) - query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + qkv = self.W_pack(hidden_states) + qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) + qkv = qkv.transpose(1, 2) + query_states, key_states, value_states = qkv.split([self.num_heads, + self.num_heads, + self.num_heads], dim=1) - kv_seq_len = key_states.shape[-2] + kv_seq_len = key_states.shape[2] if past_key_value 
is not None: - kv_seq_len += past_key_value[0].shape[-2] + kv_seq_len += past_key_value[0].shape[2] - if past_key_value is None: - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attention_mask is not None: - if q_len == 1: # inference with cache - if len(attention_mask.size()) == 4: - attention_mask = attention_mask[:, :, -1:, :] - else: - attention_mask = attention_mask[:, -1:, :] - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) + # IPEX-LLM OPT: kv cache and quantize kv + use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states) + key_states, value_states = update_past_key_value( + past_key_value, key_states, value_states, + kv_seq_len, use_quantize_kv, device + ) + past_key_value = (key_states, value_states) if use_cache else None - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) + if self.training: + warnings.warn("xops is not supported on Intel GPU, so just use normal implementation") - attn_output = torch.matmul(attn_weights, value_states) - kv_seq_len = key_states.shape[-2] - if use_cache: - k_cache, v_cache = init_fp8_kv_cache( - bsz, self.num_heads, kv_seq_len, self.head_dim, - device=device - ) - key_states, value_states = append_fp8_kv_cache(k_cache, v_cache, - key_states, value_states) - past_key_value = (key_states, value_states) + if attention_mask is not None: + if len(attention_mask.size()) == 4: + attention_mask = attention_mask[:, :, -q_len:, :] + else: + attention_mask = attention_mask[None, :, -q_len:, :] + + if use_sdp(q_len, kv_seq_len, self.head_dim, query_states): + import xe_addons + if use_quantize_kv: + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + attention_mask) + else: + attn_output = xe_addons.sdp(query_states, key_states, value_states, + attention_mask) + attn_weights = None else: - k_cache, v_cache = past_key_value - key_states, value_states = append_fp8_kv_cache(k_cache, v_cache, - key_states, value_states) - kv_seq_len = key_states.shape[-2] - past_key_value = (key_states, value_states) - if query_states.size(2) != 1 or query_states.device.type != 'xpu': + if use_quantize_kv: key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - else: - import xe_addons - attn_weights = xe_addons.query_key_fp8_matmul(query_states, key_states) - - attn_weights = attn_weights / math.sqrt(self.head_dim) - + attn_weights = torch.matmul(query_states, + key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: - if q_len == 1: # inference with cache - if len(attention_mask.size()) == 4: - attention_mask = attention_mask[:, :, -1:, :] - else: - attention_mask = attention_mask[:, -1:, :] attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - + attn_weights = attn_weights.to(query_states.dtype) attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - if query_states.size(2) != 1 or query_states.device.type != 'xpu': - attn_output = torch.matmul(attn_weights, value_states) - else: - import xe_addons - attn_output = xe_addons.attn_value_fp8_matmul(attn_weights, - value_states) - + attn_output = torch.matmul(attn_weights.to(dtype=value_states.dtype), value_states) attn_output = attn_output.transpose(1, 2) attn_output = attn_output.reshape(bsz, q_len, 
self.hidden_size) attn_output = self.o_proj(attn_output) @@ -434,90 +234,92 @@ def baichuan_attention_forward_13b_quantized( return attn_output, attn_weights, past_key_value -def baichuan_attention_forward_13b_origin( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - - proj = self.W_pack(hidden_states) - proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) - query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - enough_kv_room = True - if past_key_value is not None: - enough_kv_room = is_enough_kv_cache_room_4_31(past_key_value, seq_len=kv_seq_len) - kv_seq_len += past_key_value[0].shape[-2] - - # if past_key_value is not None: - # # reuse k, v, self_attention - # key_states = torch.cat([past_key_value[0], key_states], dim=2) - # value_states = torch.cat([past_key_value[1], value_states], dim=2) - if past_key_value is not None: - # reuse k, v, self_attention - cache_k = past_key_value[0] - cache_v = past_key_value[1] - if not enough_kv_room: - # allocate new - new_cache_k, new_cache_v = extend_kv_cache(bsz, - self.num_heads, - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - new_cache_k[:] = cache_k - new_cache_v[:] = cache_v - cache_k = new_cache_k - cache_v = new_cache_v - - key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states) - - elif use_cache: - max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = init_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) - new_key_states[:] = key_states - new_value_states[:] = value_states - key_states = new_key_states - value_states = new_value_states - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: - if q_len == 1: # inference with cache - if len(attention_mask.size()) == 4: - attention_mask = attention_mask[:, :, -1:, :] - else: - attention_mask = attention_mask[:, -1:, :] - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) - - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - - attn_output = torch.matmul(attn_weights, value_states) +def _get_interleave(n): + def _get_interleave_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio**i for i in range(n)] - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output.to(hidden_states.dtype), attn_weights, past_key_value + if math.log2(n).is_integer(): + return _get_interleave_power_of_2(n) + else: + closest_power_of_2 = 2 ** 
math.floor(math.log2(n))
+        return (
+            _get_interleave_power_of_2(closest_power_of_2)
+            + _get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+        )
+
+
+def _fill_with_neg_inf(t):
+    """FP16-compatible function that fills a tensor with -inf."""
+    return t.float().fill_(float("-inf")).type_as(t)
+
+
+def _buffered_future_mask(tensor, maxpos, alibi, attn_heads):
+    _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1)
+    _future_mask = _future_mask.unsqueeze(0) + alibi
+    new_future_mask = _future_mask.to(tensor)
+    return new_future_mask[: tensor.shape[0] * attn_heads, :maxpos, :maxpos]
+
+
+def baichuan_13b_gen_alibi_mask(tensor, n_head, max_pos):
+    slopes = torch.Tensor(_get_interleave(n_head)).to(tensor.dtype)
+    position_point = torch.arange(max_pos) - max_pos + 1
+    position_point = position_point.unsqueeze(0).unsqueeze(0).expand(n_head, -1, -1)
+    diag = torch.diag(position_point[0])
+    position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2)
+    alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point
+    alibi = alibi.view(n_head, 1, max_pos)
+    alibi_mask = torch.triu(
+        _fill_with_neg_inf(torch.zeros([max_pos, max_pos])), 1).to(tensor.dtype)
+    alibi_mask = alibi_mask.unsqueeze(0) + alibi
+    if tensor.device.type == "xpu":
+        alibi_mask = alibi_mask.to(tensor.device)
+    return alibi_mask
+
+
+MASK_BLOCK_SIZE = 512
+
+
+def baichuan_13b_get_alibi_mask(self, tensor, seq_length_with_past):
+    if self.training:
+        slopes = torch.Tensor(_get_interleave(self.n_head))
+        position_point = (
+            torch.arange(seq_length_with_past) - seq_length_with_past + 1
+        )
+        position_point = (
+            position_point.unsqueeze(0)
+            .unsqueeze(0)
+            .expand(self.n_head, seq_length_with_past, -1)
+        )
+        diag = torch.diag(position_point[0])
+        position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(
+            -1, -2
+        )
+        alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point
+        mask = _buffered_future_mask(
+            tensor, seq_length_with_past, alibi, self.n_head
+        )
+    else:
+        if self.first_run:
+            # Override the default max_cache_pos=4096 for memory considerations
+            self.max_cache_pos = seq_length_with_past + MASK_BLOCK_SIZE
+            self.first_run = False
+            self.register_buffer(
+                "future_mask",
+                baichuan_13b_gen_alibi_mask(tensor, self.n_head, self.max_cache_pos),
+                persistent=False,
+            )
+        if seq_length_with_past > self.max_cache_pos:
+            # When max_cache_pos is not enough for current sequence length,
+            # increase by MASK_BLOCK_SIZE and recalculate future_mask.
+            self.max_cache_pos = seq_length_with_past + MASK_BLOCK_SIZE
+            self.register_buffer(
+                "future_mask",
+                baichuan_13b_gen_alibi_mask(tensor, self.n_head, self.max_cache_pos),
+                persistent=False,
+            )
+        mask = self.future_mask[
+            : self.n_head, :seq_length_with_past, :seq_length_with_past
+        ]
+    return mask
diff --git a/python/llm/src/ipex_llm/transformers/models/baichuan2.py b/python/llm/src/ipex_llm/transformers/models/baichuan2.py
deleted file mode 100644
index 31d982a759d..00000000000
--- a/python/llm/src/ipex_llm/transformers/models/baichuan2.py
+++ /dev/null
@@ -1,324 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
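
# Illustrative sketch (not part of this diff): what the ALiBi helpers above compute.
# For a power-of-two head count, _get_interleave gives head h the slope
# 2 ** (-8 * (h + 1) / n_head); the bias added to attention scores is that slope times
# the (non-positive) key-minus-query distance, so far-away keys are penalized linearly.
# This is a simplified reference layout, not the exact buffered mask built above.
import math
import torch

def alibi_bias(n_head: int, seq_len: int) -> torch.Tensor:
    # slopes for a power-of-two number of heads, matching _get_interleave_power_of_2
    start = 2 ** (-(2 ** -(math.log2(n_head) - 3)))
    slopes = torch.tensor([start * start ** i for i in range(n_head)])
    # distance[q, k] = k - q  (<= 0 for keys at or before the query position)
    positions = torch.arange(seq_len)
    distance = (positions.view(1, -1) - positions.view(-1, 1)).clamp(max=0)
    return slopes.view(n_head, 1, 1) * distance          # [n_head, seq_len, seq_len]

print(alibi_bias(n_head=8, seq_len=4)[0])  # head 0 has slope 0.5: 0 on the diagonal, -0.5, -1.0, ... to the left
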
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is adapted from -# https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/cb7fc748b78b7ea99772e4cf76db155729ce774e/modeling_baichuan.py -# and -# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py - -import math -from typing import Optional, Tuple -import torch -import torch.utils.checkpoint -from torch.nn import functional as F -from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache -from ipex_llm.transformers.models.utils import update_past_key_value -from ipex_llm.transformers.models.utils import should_use_fuse_rope -from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp, use_sdp_causal -from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, SILU -from ipex_llm.transformers.models.utils import mlp_fusion_check -import warnings - - -def pre_compute_inv_freq(module: torch.nn.Module): - if module.__class__.__name__ == "RotaryEmbedding": - inv_freq = module.inv_freq - del module.inv_freq - module.register_buffer("inv_freq", inv_freq, persistent=False) - - -def baichuan_13b_rms_norm_forward(self, hidden_states): - if hidden_states.device.type == "xpu" and not (self.training or hidden_states.requires_grad): - import xe_addons - x_2d = hidden_states.reshape(-1, hidden_states.size(-1)).contiguous() - output = xe_addons.rms_norm(self.weight, x_2d, self.epsilon) - return output.reshape(hidden_states.shape) - - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.epsilon) - return self.weight * hidden_states.to(input_dtype) - - -def baichuan_mlp_forward( - self, - x: torch.Tensor, -) -> torch.Tensor: - x_2d = x.view(-1, x.shape[-1]) - qtype = getattr(self.gate_proj, "qtype", None) - if mlp_fusion_check(x_2d, qtype, self.training) and not self.down_proj.enable_xetla: - import xe_linear - if not x_2d.is_contiguous(): - x_2d = x_2d.contiguous() - return self.down_proj(xe_linear.mlp_forward_xpu( - x_2d, self.gate_proj.weight.data, self.up_proj.weight.data, - x_2d.shape[0], x_2d.shape[1], self.gate_proj.out_len, - SILU, qtype - )) - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -def baichuan_attention_forward_7b( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -): - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - - qkv = self.W_pack(hidden_states) - qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) - qkv = qkv.transpose(1, 2) - query_states, key_states, value_states = qkv.split([self.num_heads, - self.num_heads, - self.num_heads], dim=1) - - kv_seq_len = key_states.shape[2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[2] - - # IPEX-LLM OPT: fuse rope - if should_use_fuse_rope(hidden_states, 
position_ids, self.training): - import xe_addons - xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, - query_states, key_states) - else: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids, "baichuan") - query_states = query_states.to(hidden_states.dtype) - key_states = key_states.to(hidden_states.dtype) - - # IPEX-LLM OPT: kv cache and quantize kv - use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states) - key_states, value_states = update_past_key_value( - past_key_value, key_states, value_states, - kv_seq_len, use_quantize_kv, device - ) - past_key_value = (key_states, value_states) if use_cache else None - - if self.training: - warnings.warn("xops is not supported on Intel GPU, so just use normal implementation") - - # IPEX-LLM OPT: sdp - attn_weights = None - if not self.training and not hidden_states.requires_grad and \ - use_flash_attention(query_states, key_states, attention_mask): - attn_output = F.scaled_dot_product_attention(query_states.to(dtype=torch.float16), - key_states.to(dtype=torch.float16), - value_states.to(dtype=torch.float16), - is_causal=True).to(hidden_states.dtype) - elif use_sdp(q_len, kv_seq_len, self.head_dim, query_states): - import xe_addons - if use_quantize_kv: - attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, - attention_mask) - else: - attn_output = xe_addons.sdp(query_states, key_states, value_states, - attention_mask) - elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): - import xe_addons - if use_quantize_kv: - attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, - value_states, attention_mask) - else: - attn_output = xe_addons.sdp_causal(query_states, key_states, - value_states, attention_mask) - else: - if use_quantize_kv: - key_states, value_states = restore_fp8_kv_cache(key_states, value_states, - query_states.dtype) - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - # upcast attention to fp32 - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(value_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def baichuan_attention_forward_13b( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - - qkv = self.W_pack(hidden_states) - qkv = qkv.view(bsz, q_len, self.num_heads * 3, self.head_dim) - qkv = qkv.transpose(1, 2) - query_states, key_states, value_states = qkv.split([self.num_heads, - self.num_heads, - self.num_heads], dim=1) - - kv_seq_len = key_states.shape[2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[2] - - # IPEX-LLM OPT: kv cache and quantize kv - use_quantize_kv = use_quantize_kv_cache(self.W_pack, hidden_states) - key_states, value_states 
= update_past_key_value( - past_key_value, key_states, value_states, - kv_seq_len, use_quantize_kv, device - ) - past_key_value = (key_states, value_states) if use_cache else None - - if self.training: - warnings.warn("xops is not supported on Intel GPU, so just use normal implementation") - - if attention_mask is not None: - if len(attention_mask.size()) == 4: - attention_mask = attention_mask[:, :, -q_len:, :] - else: - attention_mask = attention_mask[:, None, -q_len:, :] - - if use_quantize_kv and q_len == 1: - import xe_addons - attn_weights = xe_addons.query_key_fp8_matmul(query_states, key_states) - else: - if use_quantize_kv: - key_states, value_states = restore_fp8_kv_cache(key_states, value_states, - query_states.dtype) - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) - attn_weights = attn_weights / math.sqrt(self.head_dim) - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - attn_weights = attn_weights.to(query_states.dtype) - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - if use_quantize_kv and q_len == 1: - import xe_addons - attn_output = xe_addons.attn_value_fp8_matmul(attn_weights, value_states) - else: - attn_output = torch.matmul(attn_weights.to(dtype=value_states.dtype), value_states) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def _get_interleave(n): - def _get_interleave_power_of_2(n): - start = 2 ** (-(2 ** -(math.log2(n) - 3))) - ratio = start - return [start * ratio**i for i in range(n)] - - if math.log2(n).is_integer(): - return _get_interleave_power_of_2(n) - else: - closest_power_of_2 = 2 ** math.floor(math.log2(n)) - return ( - _get_interleave_power_of_2(closest_power_of_2) - + _get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] - ) - - -def _fill_with_neg_inf(t): - """FP16-compatible function that fills a tensor with -inf.""" - return t.float().fill_(float("-inf")).type_as(t) - - -def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): - _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) - _future_mask = _future_mask.unsqueeze(0) + alibi - new_future_mask = _future_mask.to(tensor) - return new_future_mask[: tensor.shape[0] * attn_heads, :maxpos, :maxpos] - - -def baichuan_13b_gen_alibi_mask(tensor, n_head, max_pos): - slopes = torch.Tensor(_get_interleave(n_head)).to(tensor.dtype) - position_point = torch.arange(max_pos) - max_pos + 1 - position_point = position_point.unsqueeze(0).unsqueeze(0).expand(n_head, -1, -1) - diag = torch.diag(position_point[0]) - position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) - alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point - alibi = alibi.view(n_head, 1, max_pos) - alibi_mask = torch.triu( - _fill_with_neg_inf(torch.zeros([max_pos, max_pos])), 1).to(tensor.dtype) - alibi_mask = alibi_mask.unsqueeze(0) + alibi - if tensor.device.type == "xpu": - alibi_mask = alibi_mask.to(tensor.device) - return alibi_mask - - -MASK_BLOCK_SIZE = 512 - - -def baichuan_13b_get_alibi_mask(self, tensor, seq_length_with_past): - if self.training: - slopes = torch.Tensor(_get_interleave(self.n_head)) - position_point = ( - torch.arange(seq_length_with_past) - seq_length_with_past + 1 - ) - position_point = ( - position_point.unsqueeze(0) - .unsqueeze(0) - 
.expand(self.n_head, seq_length_with_past, -1) - ) - diag = torch.diag(position_point[0]) - position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose( - -1, -2 - ) - alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point - mask = _buffered_future_mask( - tensor, seq_length_with_past, alibi, self.n_head - ) - else: - if self.first_run: - # Override the default max_cache_pos=4096 for memory considerations - self.max_cache_pos = seq_length_with_past + MASK_BLOCK_SIZE - self.first_run = False - self.register_buffer( - "future_mask", - baichuan_13b_gen_alibi_mask(tensor, self.n_head, self.max_cache_pos), - persistent=False, - ) - if seq_length_with_past > self.max_cache_pos: - # When max_cache_pos is not enough for current sequence length, - # increase by MASK_BLOCK_SIZE and recalculate future_mask. - self.max_cache_pos = seq_length_with_past + MASK_BLOCK_SIZE - self.register_buffer( - "future_mask", - baichuan_13b_gen_alibi_mask(tensor, self.n_head, self.max_cache_pos), - persistent=False, - ) - mask = self.future_mask[ - : self.n_head, :seq_length_with_past, :seq_length_with_past - ] - return mask diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm4.py b/python/llm/src/ipex_llm/transformers/models/chatglm4.py new file mode 100644 index 00000000000..5c7f156b20c --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/models/chatglm4.py @@ -0,0 +1,352 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
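
# Illustrative sketch (not part of this diff): why a quantized KV cache needs a
# "restore" step. The patch stores K/V in FP8 and either calls the fused
# xe_addons.sdp_fp8 kernel or dequantizes via restore_fp8_kv_cache before the plain
# matmul path. The toy cache below uses a simple per-chunk int8 scheme purely to
# illustrate that store-compressed / dequantize-before-use pattern; it is not the
# FP8 layout ipex-llm actually uses.
import torch

class ToyQuantKVCache:
    def __init__(self):
        self.k_chunks, self.v_chunks = [], []          # list of (int8 tensor, scale)

    @staticmethod
    def _quant(x):
        scale = x.abs().amax().clamp(min=1e-8) / 127.0
        return (x / scale).round().clamp(-127, 127).to(torch.int8), scale

    def append(self, k, v):                            # [bsz, n_head, seq, head_dim]
        self.k_chunks.append(self._quant(k))
        self.v_chunks.append(self._quant(v))

    def restore(self, dtype=torch.float32):
        k = torch.cat([q.to(dtype) * s for q, s in self.k_chunks], dim=2)
        v = torch.cat([q.to(dtype) * s for q, s in self.v_chunks], dim=2)
        return k, v

cache = ToyQuantKVCache()
cache.append(torch.randn(1, 2, 4, 8), torch.randn(1, 2, 4, 8))   # prefill
cache.append(torch.randn(1, 2, 1, 8), torch.randn(1, 2, 1, 8))   # one decode step
k, v = cache.restore()
print(k.shape, v.shape)   # torch.Size([1, 2, 5, 8]) torch.Size([1, 2, 5, 8])
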
+# +# This file is adapted from +# https://huggingface.co/THUDM/chatglm2-6b-32k/blob/main/configuration_chatglm.py +# + +import torch +from typing import Optional, Tuple, Union, List, Callable, Dict, Any +import torch.nn.functional as F +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import use_quantize_kv_cache, apply_ipex_rotate_every_two +from ipex_llm.transformers.models.utils import use_sdp +from ipex_llm.transformers.models.chatglm2 import should_split_qkv_tensor +from ipex_llm.transformers.models.chatglm2 import split_tensor_along_last_dim +from transformers.modeling_outputs import BaseModelOutputWithPast + + +import os + +KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) +KV_CACHE_ALLOC_MIN_LENGTH = 512 + + +def chatglm4_model_forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]=None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + from ipex_llm.transformers.kv import DynamicFp8Cache + use_cache = use_cache if use_cache is not None else self.config.use_cache + # if use_cache and use_quantize_kv_cache( + # self.encoder.layers[0].self_attention.query_key_value, input_ids): + # if not isinstance(past_key_values, DynamicFp8Cache): + # past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) + return chatglm4_model_forward_internal( + self=self, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + full_attention_mask=full_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +def chatglm4_model_forward_internal( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]=None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if full_attention_mask is None: + if (attention_mask is not None and not attention_mask.all()) or\ + (past_key_values and seq_length != 1): + full_attention_mask = self.get_masks(input_ids, + past_key_values, + padding_mask=attention_mask) + + use_fuse_rope = input_ids.device.type == "xpu" + use_fuse_rope = use_fuse_rope and not self.training + + # Rotary positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + if 
use_fuse_rope: + # Repeat cos sin here, call only once for each token. + # Chatglm2's rotary embedding is similar to gptj's, is rotate_every_two. + # If put this to attension forward, it will generate too many times. + cos, sin = rotary_pos_emb.split(rotary_pos_emb.shape[-1] // 2, dim=-1) + cos = cos.squeeze(-1) + sin = sin.squeeze(-1) + cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3) + sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3) + rotary_pos_emb = (cos, sin) + + # Run encoder. + hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states + ) + if presents is not None and type(presents) is torch.Tensor: + presents = presents.split(1, dim=0) + presents = list(presents) + presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents] + presents = [tuple([x.squeeze(0) for x in y]) for y in presents] + presents = tuple(presents) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] + if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@torch.jit.script +def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: + # x: [b, np, sq, hn] + b, np, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3) + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:, :sq] + xshaped = x.reshape(b, np, sq, rot_dim // 2, 2) + rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2) + x_out2 = torch.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return torch.cat((x_out2, x_pass), dim=-1) + + +def chatglm4_attention_forward( + self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True +): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. 
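
# Illustrative sketch (not part of this diff): the "rotate every two" (GPT-J style)
# rotary embedding that the fused XPU kernel applies in place. The cos/sin tables are
# repeat_interleave'd by 2 in chatglm4_model_forward above precisely so that each
# adjacent pair of channels shares one angle, as in this plain-PyTorch reference.
# Shapes and base=10000 below are assumptions for illustration only.
import torch

def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...) along the last dimension
    x_even = x[..., ::2]
    x_odd = x[..., 1::2]
    return torch.stack((-x_odd, x_even), dim=-1).flatten(-2)

def apply_rotary_every_two(x, cos, sin):
    # x: [batch, seq, n_head, rot_dim]; cos/sin already repeat_interleave'd to rot_dim
    return x * cos + rotate_every_two(x) * sin

seq_len, rot_dim = 5, 8
inv_freq = 1.0 / (10000 ** (torch.arange(0, rot_dim, 2).float() / rot_dim))
angles = torch.outer(torch.arange(seq_len).float(), inv_freq)          # [seq, rot_dim // 2]
cos = angles.cos().repeat_interleave(2, dim=-1)[None, :, None, :]      # [1, seq, 1, rot_dim]
sin = angles.sin().repeat_interleave(2, dim=-1)[None, :, None, :]
q = torch.randn(1, seq_len, 2, rot_dim)                                # 2 heads
print(apply_rotary_every_two(q, cos, sin).shape)                       # torch.Size([1, 5, 2, 8])
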
+ # ================================================= + # ===================== + # Query, Key, and Value + # ===================== + + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + device = hidden_states.device + mixed_x_layer = self.query_key_value(hidden_states) + + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, + self.hidden_size_per_attention_head) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + + # [b, sq, np, hn] -> [b, np, sq, hn] + query_layer, key_layer, value_layer = [k.transpose(1, 2) + for k in [query_layer, key_layer, value_layer]] + + # apply relative positional encoding (rotary embedding) + if isinstance(rotary_pos_emb, tuple) and len(rotary_pos_emb) == 2: + # use_fuse_rope, see chatglm4_model_forward + cos, sin = rotary_pos_emb + rot_dim = cos.shape[-1] + query_layer = query_layer.transpose(1, 2) + key_layer = key_layer.transpose(1, 2) + query_layer_cur = query_layer[..., :rot_dim] + key_layer_cur = key_layer[..., :rot_dim] + # ipex_llm's apply_rotary_embedding can change the origin storage, + # so query_layer will get the result directly. 
+ torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur) + torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur) + query_layer = query_layer.transpose(1, 2) + key_layer = key_layer.transpose(1, 2) + elif rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + cur_length, batch_size = query_layer.shape[2], query_layer.shape[0] + + # adjust key and value for inference + if kv_cache is not None and use_cache: + cache_k, cache_v = kv_cache + past_length = cache_k.size(2) + + if cache_k.stride()[1] < (past_length + cur_length) * cache_k.size(3): + max_cache_length = past_length + cur_length + KV_CACHE_ALLOC_BLOCK_LENGTH + new_cache_k, new_cache_v = extend_kv_cache(batch_size, + key_layer.size(1), + self.hidden_size_per_attention_head, + past_length, + max_cache_length, + dtype=query_layer.dtype, + device=device) + new_cache_k[:] = cache_k + new_cache_v[:] = cache_v + cache_k = new_cache_k + cache_v = new_cache_v + + key_layer, value_layer = append_kv_cache(cache_k, cache_v, key_layer, value_layer) + + if use_cache: + if kv_cache is None: + kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), + value_layer.unsqueeze(0).unsqueeze(0)), dim=1) + else: + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + if self.multi_query_attention: + key_layer = key_layer.unsqueeze(2) + key_layer = key_layer.expand( + -1, -1, + self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, + -1, -1 + ) + key_layer = key_layer.contiguous().view( + key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:] + ) + value_layer = value_layer.unsqueeze(2) + value_layer = value_layer.expand( + -1, -1, + self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, + -1, -1 + ) + value_layer = value_layer.contiguous().view( + value_layer.size()[:1] + + (self.num_attention_heads_per_partition,) + value_layer.size()[3:] + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = core_attn_forward(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. 
[sq, b, h] + # ================= + + output = self.dense(context_layer) + + return output, kv_cache + + +def core_attn_forward(query_layer, key_layer, value_layer, attention_mask): + L, S = query_layer.shape[2], key_layer.shape[2] + if attention_mask is None and L == S: + batch_size, n_head, seq_len, head_dim = query_layer.shape + if should_split_qkv_tensor(query_layer, batch_size, n_head, seq_len): + # split second dim to block size = 8 + block_size = 8 + query_split = torch.split(query_layer.to(key_layer.dtype), block_size, dim=1) + key_split = torch.split(key_layer, block_size, dim=1) + value_split = torch.split(value_layer, block_size, dim=1) + results = [] + for q, k, v in zip(query_split, key_split, value_split): + result = F.scaled_dot_product_attention(q, k, v, is_causal=True).to(k.dtype) + results.append(result) + context_layer = torch.cat(results, dim=1) + else: + context_layer = F.scaled_dot_product_attention(query_layer.to(key_layer.dtype), + key_layer, + value_layer, + is_causal=True).to(key_layer.dtype) + else: + # attention_mask is not None only when past_key_value is not None and q_len > 1 + if attention_mask is not None: + attn_bias = torch.zeros(attention_mask.shape, dtype=query_layer.dtype, + device=query_layer.device) + attention_mask = ~attention_mask + if attention_mask.dtype == torch.bool: + attn_bias.masked_fill_(attention_mask.logical_not(), float("-inf")) + else: + attn_bias += attention_mask + else: + attn_bias = None + + if use_sdp(query_layer.shape[2], key_layer.shape[2], + query_layer.shape[-1], query_layer): + import xe_addons + attn_output = xe_addons.sdp(query_layer, key_layer, value_layer, attn_bias) + context_layer = attn_output.view(query_layer.shape) + else: + head_dim = query_layer.size(-1) + attn = torch.matmul(query_layer.to(key_layer.dtype), + key_layer.transpose(2, 3)) / math.sqrt(head_dim) + if attn_bias is not None: + attn += attn_bias + attn = F.softmax(attn, dim=-1, + dtype=torch.float32).to(value_layer.dtype) + context_layer = torch.matmul(attn, value_layer) + context_layer = context_layer.transpose(1, 2).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (-1,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer diff --git a/python/llm/src/ipex_llm/transformers/models/cohere.py b/python/llm/src/ipex_llm/transformers/models/cohere.py index 9ee4f142715..5e3437e3a42 100644 --- a/python/llm/src/ipex_llm/transformers/models/cohere.py +++ b/python/llm/src/ipex_llm/transformers/models/cohere.py @@ -54,7 +54,7 @@ from transformers.models.cohere.modeling_cohere import apply_rotary_pos_emb from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache from ipex_llm.transformers.kv import DynamicFp8Cache -from ipex_llm.transformers.models.qwen2 import should_use_fuse_rope +from ipex_llm.transformers.models.utils import should_use_fuse_rope from transformers.modeling_outputs import BaseModelOutputWithPast from ipex_llm.utils.common import invalidInputError try: @@ -313,7 +313,7 @@ def cohere_attention_forward_origin( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() device = hidden_states.device - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) + use_fuse_rope = should_use_fuse_rope(hidden_states, position_ids, self.training) enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) decoding_fast_path = use_decoding_fast_path(self.q_proj, use_fuse_rope, diff 
--git a/python/llm/src/ipex_llm/transformers/models/gptbigcode.py b/python/llm/src/ipex_llm/transformers/models/gptbigcode.py index 611b9fba6a4..747cc26d56f 100644 --- a/python/llm/src/ipex_llm/transformers/models/gptbigcode.py +++ b/python/llm/src/ipex_llm/transformers/models/gptbigcode.py @@ -99,3 +99,99 @@ def gptbigcode_attention_forward( outputs += (attn_weights,) return outputs + + +def gptbigcode_sdpa_attention_forward( + self, + hidden_states: torch.Tensor, + layer_past: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, +) -> Union[ + Tuple[torch.Tensor, Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]], +]: + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn") or not self.is_cross_attention: + from ipex_llm.utils.common import invalidInputError + invalidInputError( + False, + "If class is used as cross attention," + + "the weights `q_attn` have to be defined. " + + "Please make sure to instantiate class with " + + "`GPTBigCodeAttention(..., is_cross_attention=True)`." + ) + + query = self.q_attn(hidden_states) + key_value = self.c_attn(encoder_hidden_states) + attention_mask = encoder_attention_mask + elif self.multi_query: + query, key_value = self.c_attn(hidden_states).split( + (self.embed_dim, 2 * self.kv_dim), dim=2) + else: + # Note: We split as (self.num_heads, 3, self.head_dim) instead of + # (3, self.num_heads, self.head_dim), + # i.e., the memory layout is not the same as GPT2. + # This makes the concatenation with past_key_value more efficient. + query, key_value = ( + self.c_attn(hidden_states) + .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim) + .transpose(1, 2) + .split((self.head_dim, 2 * self.head_dim), dim=3) + ) + + if layer_past is not None: + if layer_past.shape[-2] == key_value.shape[-2]: + key_value = torch.cat((layer_past, key_value), dim=-2) + else: + fill_zeros = torch.zeros(layer_past.shape[0], + layer_past.shape[1], + key_value.shape[2] - layer_past.shape[2], + dtype=layer_past.dtype, + device=layer_past.device) + layer_past = torch.cat([layer_past, fill_zeros], dim=-1) + key_value = torch.cat((layer_past, key_value), dim=-2) + # key_value = torch.cat((layer_past, key_value), dim=-2) + present = key_value if use_cache else None + + key, value = key_value.split((self.head_dim, self.head_dim), dim=-1) + + if not output_attentions and head_mask is None: + # Difference with the original implementation: there is no need to + # transpose the key here, + # as SDPA expects seq_length to be at index -2 for the key as well + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + else: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` + # once this is implemented. + logger.warning_once( + "GPTBigCodeModel is using GPTBigCodeSdpaAttention, " + "but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` and `head_mask` not None." + ' Falling back to the manual attention implementation, ' + 'but specifying the manual implementation will be required from ' + 'Transformers version v5.0.0 onwards. ' + 'This warning can be removed using the argument `attn_implementation="eager"` ' + 'when loading the model.' 
+ ) + attn_output, attn_weights = super()._attn(query, key.transpose(-1, -2), + value, attention_mask, head_mask) + + if not self.multi_query: + attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + if self.multi_query: + # Transpose to return weights in the usual format + # (batch_size, num_heads, query_length, key_length) + attn_weights = attn_weights.transpose(1, 2) + outputs += (attn_weights,) + + return outputs diff --git a/python/llm/src/ipex_llm/transformers/models/minicpm.py b/python/llm/src/ipex_llm/transformers/models/minicpm.py new file mode 100644 index 00000000000..baa38d97152 --- /dev/null +++ b/python/llm/src/ipex_llm/transformers/models/minicpm.py @@ -0,0 +1,771 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Some parts of this file is adapted from +# https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/llama/modeling_llama.py +# which is licensed under Apache License 2.0: +# +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
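
# Illustrative sketch (not part of this diff): the SDPA-vs-manual fallback pattern used
# in gptbigcode_sdpa_attention_forward above. scaled_dot_product_attention cannot return
# attention weights, so a manual matmul/softmax path is kept for output_attentions=True
# (or a head mask). Minimal sketch with made-up shapes; the real code also handles the
# multi-query layout and past key/values.
import math
import torch
import torch.nn.functional as F

def attn(q, k, v, need_weights: bool):
    # q, k, v: [batch, n_head, seq, head_dim]
    if not need_weights:
        return F.scaled_dot_product_attention(q, k, v, is_causal=True), None
    scores = q @ k.transpose(-1, -2) / math.sqrt(q.size(-1))
    causal = torch.full(scores.shape[-2:], float("-inf")).triu(1)
    weights = torch.softmax(scores + causal, dim=-1)
    return weights @ v, weights

q = k = v = torch.randn(1, 2, 4, 8)
out_fast, _ = attn(q, k, v, need_weights=False)
out_slow, w = attn(q, k, v, need_weights=True)
print(torch.allclose(out_fast, out_slow, atol=1e-5), w.shape)
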
+ +import torch +import warnings +import importlib +import torch.nn as nn +from typing import Optional, Tuple, Union, List +import math +import os +import torch.nn.functional as F +from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache +from ipex_llm.transformers.models.utils import SILU +from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ + restore_fp8_kv_cache, use_quantize_kv_cache +from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, \ + apply_rotary_pos_emb, is_enough_kv_cache_room_4_36 +from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_no_cache_xpu +from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp, use_sdp_fp8 +from ipex_llm.transformers.models.utils import mlp_fusion_check, fp16_fusion_check +from ipex_llm.transformers.models.utils import use_decoding_fast_path +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.llama.modeling_llama import LlamaModel +from ipex_llm.transformers.low_bit_linear import SYM_INT4, FP8E5, IQ2_XXS, FP4 +from ipex_llm.ggml.quantize import ggml_tensor_qtype +from ipex_llm.utils.common import invalidInputError +from ipex_llm.transformers.models.llama import should_use_fuse_rope, should_use_xetla_mm_qkv +from ipex_llm.transformers.models.llama import fuse_qkv_weight_xetla, repeat_kv, native_sdp +from ipex_llm.transformers.models.llama import llama_decoding_fast_path_qtype_check +from ipex_llm.transformers.models.llama import should_split_qkv_tensor, should_split_qkv_tensor + +try: + from transformers.cache_utils import Cache, DynamicCache +except ImportError: + Cache = Tuple[torch.Tensor] +from transformers import logging +KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) + + +def minicpm_attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[List[torch.FloatTensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: + if use_quantize_kv_cache(self.q_proj, hidden_states): + forward_function = minicpm_attention_forward_quantized + else: + forward_function = minicpm_attention_forward_original + return forward_function( + self=self, + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + kwargs=kwargs + ) + + +def minicpm_attention_forward_original( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[List[torch.FloatTensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, hidden_size = hidden_states.size() + device = hidden_states.device + # for flash attention + original_dtype = hidden_states.dtype + + use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, seq_len=q_len) + no_tp = not self.config.pretraining_tp > 1 + decoding_fast_path = use_decoding_fast_path(self.q_proj, + use_fuse_rope, + enough_kv_room, + bsz * q_len, + llama_decoding_fast_path_qtype_check) and no_tp + + # single batch decoding fast path + # forward_qkv takes will perform QKV projection, rotary position embedding + # and save the key/value states to cache, then return query states and the + # extended key/value cache + if decoding_fast_path: + hidden_states = hidden_states.view(1, -1) + cache_k = past_key_value.key_cache[self.layer_idx] + cache_v = past_key_value.value_cache[self.layer_idx] + kv_seq_len = cache_k.shape[-2] + import xe_linear + query_states, key_states, value_states = xe_linear.forward_qkv(hidden_states, + self.q_proj.weight, + self.k_proj.weight, + self.v_proj.weight, + position_ids, + cache_k, cache_v, + self.q_proj.weight.qtype, + self.v_proj.weight.qtype, + kv_seq_len, + self.head_dim, + self.rotary_emb.base,) + kv_seq_len += 1 + # update past_key_value's seem_tokens and kv caches. + if self.layer_idx == 0: + past_key_value.seen_tokens = kv_seq_len + past_key_value.key_cache[self.layer_idx] = key_states + past_key_value.value_cache[self.layer_idx] = value_states + + else: + if self.config.pretraining_tp > 1: + key_value_slicing = ((self.num_key_value_heads * self.head_dim) // + self.config.pretraining_tp) + query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) + // self.config.pretraining_tp, dim=0) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) + for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) + for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) + for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + else: + if fp16_fusion_check(self.q_proj, hidden_states, self.training) and \ + hidden_size == 4096 and self.q_proj.out_features == self.k_proj.out_features: + # only use mm_qkv_out on pvc for llama-7b + if not hasattr(self, "qkv_proj_weight"): + self.qkv_proj_weight = torch.stack([self.q_proj.weight, + self.k_proj.weight, + self.v_proj.weight]).contiguous() + self.q_proj.weight.data = self.qkv_proj_weight[0, :, :] + self.k_proj.weight.data = self.qkv_proj_weight[1, :, :] + self.v_proj.weight.data = self.qkv_proj_weight[2, :, :] + torch.xpu.empty_cache() + query_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], + dtype=hidden_states.dtype, device=hidden_states.device) + key_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], + dtype=hidden_states.dtype, device=hidden_states.device) + value_states = torch.empty(bsz, q_len, self.qkv_proj_weight.shape[-1], + dtype=hidden_states.dtype, device=hidden_states.device) + torch.ops.torch_ipex.mm_qkv_out( + hidden_states, self.qkv_proj_weight, None, + query_states, key_states, value_states + ) + else: + if 
should_use_xetla_mm_qkv(self, device): + if not hasattr(self, "qkv_proj_qweight"): + self.qkv_proj_qweight = fuse_qkv_weight_xetla(self.q_proj, + self.k_proj, + self.v_proj, + self.q_proj.weight.qtype,) + import xe_linear + q_out_len = self.q_proj.out_len + k_out_len = self.k_proj.out_len + v_out_len = self.v_proj.out_len + qkv_states = xe_linear.mm_xetla(hidden_states, + self.qkv_proj_qweight, + self.q_proj.weight.qtype) + query_states = qkv_states[:, :, :q_out_len] + key_states = qkv_states[:, :, q_out_len:q_out_len + k_out_len] + value_states = qkv_states[:, :, q_out_len + k_out_len:] + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, + self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, + self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, + self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + invalidInputError(False, + "The cache structure has changed since version v4.36. " + f"If you are using {self.__class__.__name__} for " + "auto-regressive decodingwith k/v caching, please make sure " + "to initialize the attention class with a layer index.") + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + if use_fuse_rope: + rope_theta = self.rotary_emb.base + query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, + key_states, + position_ids, + "llama", + rope_theta=rope_theta) + else: + if cache_position is not None: + # for transformers 4.38.0 + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "llama2") + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "llama") + + if past_key_value is not None: + # update the number of seen tokens + if self.layer_idx == 0: + past_key_value.seen_tokens += key_states.shape[-2] + + # reuse k, v, self_attention + # update `past_key_value` with `key_states` and `value_states` for layer `layer_idx` + if len(past_key_value.key_cache) <= self.layer_idx: + past_key_value.key_cache.append(key_states) + past_key_value.value_cache.append(value_states) + else: + cache_k = past_key_value.key_cache[self.layer_idx] + cache_v = past_key_value.value_cache[self.layer_idx] + + if not enough_kv_room: + # allocate new + new_c_k, new_c_v = extend_kv_cache(bsz, + self.num_key_value_heads, # Support GQA + self.head_dim, + cache_k.size(2), + kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, + dtype=cache_k.dtype, + device=device) + + new_c_k[:] = cache_k + new_c_v[:] = cache_v + cache_k = new_c_k + cache_v = new_c_v + + key_states, value_states = append_kv_cache(cache_k, + cache_v, + key_states, + value_states) + + # update past_key_value + past_key_value.key_cache[self.layer_idx] = key_states + past_key_value.value_cache[self.layer_idx] = value_states + + if cache_position is not None: + new_attention_mask = attention_mask[:, :, kv_seq_len - q_len:kv_seq_len, 0:kv_seq_len] + else: + new_attention_mask = attention_mask + + if not self.training and not hidden_states.requires_grad and \ + use_flash_attention(query_states, key_states, new_attention_mask): + # repeat k/v heads if n_kv_heads < 
n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + # now only use flash attention for first token + attn_output = F.scaled_dot_product_attention(query_states.to(device, dtype=torch.float16), + key_states.to(device, dtype=torch.float16), + value_states.to(device, dtype=torch.float16), + is_causal=True) + attn_weights = None + elif not self.training and not hidden_states.requires_grad and \ + self.layer_idx > 0 and \ + use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): + import xe_addons + attn_output = xe_addons.sdp(query_states, key_states, value_states, + new_attention_mask) + attn_output = attn_output.view(query_states.shape) + attn_weights = None + else: + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + # otherwise, use native attention + if query_states.device.type == "xpu": + attn_output, attn_weights = native_sdp(query_states, key_states, value_states, + new_attention_mask, cache_position, + bsz, q_len, kv_seq_len, + self.head_dim, self.num_heads, output_attentions) + else: + # CPU path + if not output_attentions: + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=new_attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with + # AttentionMaskConverter.to_causal_4d that + # does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and new_attention_mask is None and q_len > 1, + ) + else: + attn_output, attn_weights = native_sdp(query_states, key_states, value_states, + new_attention_mask, cache_position, + bsz, q_len, kv_seq_len, + self.head_dim, + self.num_heads, output_attentions) + + attn_output_size = (bsz, self.num_heads, q_len, self.head_dim) + if attn_output.size() != attn_output_size: + invalidInputError(False, + f"`attn_output` should be of size {attn_output_size}," + f" but is {attn_output.size()}") + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, + dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) + for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output.to(original_dtype), attn_weights, past_key_value + + +def minicpm_attention_forward_quantized( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[List[torch.FloatTensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" + "Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + device = hidden_states.device + use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) + enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx, seq_len=q_len) + no_tp = not self.config.pretraining_tp > 1 + decoding_fast_path = use_decoding_fast_path(self.q_proj, + use_fuse_rope, + enough_kv_room, + bsz * q_len, + llama_decoding_fast_path_qtype_check) and no_tp + if decoding_fast_path: + hidden_states = hidden_states.view(1, -1) + tmp_cache_k, tmp_cache_v = init_kv_cache( + bsz, + self.num_key_value_heads, + self.head_dim, + 0, + 1, + dtype=hidden_states.dtype, + device=device + ) + import xe_linear + query_states, key_states, value_states = xe_linear.forward_qkv(hidden_states, + self.q_proj.weight, + self.k_proj.weight, + self.v_proj.weight, + position_ids, + tmp_cache_k, tmp_cache_v, + self.q_proj.weight.qtype, + self.v_proj.weight.qtype, + 0, + self.head_dim, + self.rotary_emb.base,) + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, + self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, + self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, + self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + invalidInputError( + False, + f"The cache structure has changed since version v4.36." + f" If you are using {self.__class__.__name__} " + f"for auto-regressive decoding with k/v caching," + f" please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + if use_fuse_rope: + rope_theta = self.rotary_emb.base + query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, + key_states, + position_ids, + "llama", + rope_theta=rope_theta) + else: + if cache_position is not None: + # for transformers 4.38.0 + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "llama2") + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids, "llama") + kv_seq_len = key_states.shape[-2] + + if len(past_key_value.key_cache) <= self.layer_idx: + repeated_key_states = repeat_kv(key_states, self.num_key_value_groups) + repeated_value_states = repeat_kv(value_states, self.num_key_value_groups) + if should_split_qkv_tensor(query_states, bsz, self.num_heads, + q_len, kv_seq_len, output_attentions): + attn_output, _ = native_sdp_split_qkv_tensor(query_states, repeated_key_states, + repeated_value_states, + attention_mask, cache_position, + bsz, q_len, kv_seq_len, self.head_dim, + self.num_heads) + else: + attn_weights = torch.matmul(query_states, repeated_key_states + .transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + invalidInputError( + False, + f"Attention weights should be of size " + f"{(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if cache_position is not None: + # for transformers 4.38.0 + causal_mask = attention_mask[:, :, cache_position, : kv_seq_len] + attn_weights = attn_weights + causal_mask + else: + attn_mask_size = (bsz, 1, q_len, kv_seq_len) + if attention_mask.size() != attn_mask_size: + invalidInputError(False, + f"Attention mask should be of size {attn_mask_size}, " + f"but is {attention_mask.size()}") + attn_weights = attn_weights + attention_mask + + if kv_seq_len >= 2048 or bsz >= 64: + # for memory considerations, do not upcast attention to fp32 + # for long sequences or large batches + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + else: + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, + dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, repeated_value_states) + if use_cache: + cache_kwargs = None + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, cache_kwargs) + else: + cache_kwargs = None # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, cache_kwargs) + kv_seq_len = key_states.shape[-2] + if not use_sdp_fp8(q_len, key_states.shape[2], query_states): + key_states, value_states = restore_fp8_kv_cache(key_states, value_states, + query_states.dtype) + key_states = repeat_kv(key_states, self.num_key_value_groups)\ + .to(device, dtype=query_states.dtype) + value_states = repeat_kv(value_states, self.num_key_value_groups)\ + .to(device, dtype=query_states.dtype) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + attn_weights = attn_weights / math.sqrt(self.head_dim) + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + invalidInputError( + False, + f"Attention weights should be of size" + f" {(bsz, self.num_heads, q_len, kv_seq_len)}," + f" but is 
{attn_weights.size()}" + ) + + if attention_mask is not None: + if cache_position is not None: + # for transformers 4.38.0 + causal_mask = attention_mask[:, :, cache_position, : kv_seq_len] + attn_weights = attn_weights + causal_mask + else: + attn_mask_size = (bsz, 1, q_len, kv_seq_len) + if attention_mask.size() != attn_mask_size: + invalidInputError(False, + f"Attention mask should be of size {attn_mask_size}, " + f"but is {attention_mask.size()}") + attn_weights = attn_weights + attention_mask + + if kv_seq_len >= 2048 or bsz >= 64: + # for memory considerations, do not upcast attention to fp32 + # for long sequences or large batches + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + else: + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, + dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + else: + import xe_addons + if cache_position is not None: + new_attn_mask = attention_mask[:, :, kv_seq_len-q_len:kv_seq_len, 0:kv_seq_len] + else: + new_attn_mask = attention_mask + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, new_attn_mask) + attn_weights = None + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + invalidInputError( + False, + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}," + f" but is {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size + // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], + o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +def minicpm_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + from ipex_llm.transformers.kv import DynamicFp8Cache + use_cache = use_cache if use_cache is not None else self.config.use_cache + input = input_ids if input_ids is not None else inputs_embeds + if use_cache and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input): + if not isinstance(past_key_values, DynamicFp8Cache): + past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) + return minicpm_model_forward_internal( + self=self, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +def minicpm_model_forward_internal( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + 
inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None \ + else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + invalidInputError(False, + "You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + invalidInputError(False, + "You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing." + " Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, + dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask)\ + else None + elif self._use_sdpa and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
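# --- illustrative sketch of the 4-D causal-mask preparation used in the fallback branch just below ---
# Minimal, hypothetical example with arbitrary sizes; it only demonstrates the call shape of the
# transformers>=4.36 helper that this forward imports, not the model's actual tensors.
import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch, seq_len, past_len, hidden = 1, 4, 2, 8
padding_mask = torch.ones(batch, past_len + seq_len, dtype=torch.long)  # 2-D mask: 1 = keep token
dummy_embeds = torch.zeros(batch, seq_len, hidden)                      # only dtype/device are read from this
mask_4d = _prepare_4d_causal_attention_mask(
    padding_mask, (batch, seq_len), dummy_embeds, past_len
)
# mask_4d has shape (batch, 1, seq_len, past_len + seq_len): 0 where attention is allowed and a
# large negative value where it is masked, ready to be added to the raw attention scores.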
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + # bigdl-llm changes: + curr_device = decoder_layer.input_layernorm.weight.device + if attention_mask is not None: + attention_mask = attention_mask.to(curr_device) + if position_ids is not None: + position_ids = position_ids.to(curr_device) + # bigdl-llm changes end + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache \ + else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) diff --git a/python/llm/src/ipex_llm/transformers/models/qwen2.py b/python/llm/src/ipex_llm/transformers/models/qwen2.py index 4a16df4be83..2e236acb1d6 100644 --- a/python/llm/src/ipex_llm/transformers/models/qwen2.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen2.py @@ -38,8 +38,7 @@ # import math -import warnings -from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List +from typing import Optional, Tuple, Union, List import torch from torch.nn.functional import scaled_dot_product_attention as sdpa @@ -74,7 +73,10 @@ def qwen2_model_forward( return_dict: Optional[bool] = None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache - use_quantize_kv = use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids) + use_quantize_kv = ( + self.config.hidden_size != 3584 # disable quantize kv in specific model + and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids) + ) if use_cache: if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) @@ -324,6 +326,9 @@ def qwen2_attention_forward( attn_weights = None if query_states.device.type 
== "cpu": + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) attn_output = sdpa(query_states, key_states, value_states, @@ -332,6 +337,9 @@ def qwen2_attention_forward( is_causal=self.is_causal and attention_mask is None and q_len > 1) elif not self.training and not hidden_states.requires_grad and \ use_flash_attention(query_states, key_states, attention_mask): + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) attn_output = sdpa(query_states.to(device, dtype=torch.float16), key_states.to(device, dtype=torch.float16), value_states.to(device, dtype=torch.float16), diff --git a/python/llm/src/ipex_llm/transformers/models/qwen2_moe.py b/python/llm/src/ipex_llm/transformers/models/qwen2_moe.py index 9f14ca086c9..be159316029 100644 --- a/python/llm/src/ipex_llm/transformers/models/qwen2_moe.py +++ b/python/llm/src/ipex_llm/transformers/models/qwen2_moe.py @@ -37,39 +37,20 @@ # limitations under the License. """ PyTorch Qwen2MoE model.""" -import math import torch import torch.nn.functional as F -import torch.nn as nn import torch.utils.checkpoint -import warnings -from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List -from ipex_llm.transformers.models.llama import repeat_kv -from ipex_llm.transformers.models.qwen2 import should_use_fuse_rope -from ipex_llm.transformers.models.utils import extend_kv_cache, append_kv_cache -from ipex_llm.transformers.models.utils import apply_rotary_pos_emb_cache_freq_xpu -from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36 -from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb +from typing import Optional, Tuple, Union, List from ipex_llm.utils.common import invalidInputError -from ipex_llm.transformers.models.utils import decoding_fast_path_qtype_check -from ipex_llm.transformers.models.utils import use_flash_attention -from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeModel, apply_rotary_pos_emb -from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache -from ipex_llm.transformers.kv import DynamicFp8Cache +from ipex_llm.transformers.models.utils import use_quantize_kv_cache +from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache -import os - -KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) - -from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask_for_sdpa -from transformers.models.qwen2.modeling_qwen2 import _prepare_4d_causal_attention_mask +from transformers.models.qwen2_moe.modeling_qwen2_moe import ( + _prepare_4d_causal_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask, + Qwen2MoeAttention, +) from transformers.modeling_outputs import MoeModelOutputWithPast - -try: - from transformers.cache_utils import Cache, DynamicCache -except ImportError: - Cache = Tuple[torch.Tensor] -import logging +from transformers.cache_utils import Cache, DynamicCache from transformers import logging @@ -90,9 +71,12 @@ def qwen2moe_model_forward( return_dict: Optional[bool] = None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache - if use_cache and use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input_ids): - if not isinstance(past_key_values, DynamicFp8Cache): + use_quantize_kv 
= use_quantize_kv_cache(self.layers[0].mlp.shared_expert.up_proj, input_ids) + if use_cache: + if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) + if not use_quantize_kv and not isinstance(past_key_values, DynamicNormalCache): + past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values) return qwen2_moe_model_forward_internal( self=self, input_ids=input_ids, @@ -290,452 +274,27 @@ def qwen2_moe_model_forward_internal( ) -def qwen2moe_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if use_quantize_kv_cache(self.q_proj, hidden_states): - forward_function = qwen2moe_attention_forward_quantized - elif hidden_states.device.type == "cpu": - forward_function = qwen2moe_attention_forward_sdpa - else: - forward_function = qwen2moe_attention_forward_origin - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - **kwargs, - ) - - -def qwen2moe_attention_forward_quantized( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37." - "Please make sure use `attention_mask` instead.`" - ) - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, - self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - invalidInputError(self.layer_idx is not None, - "The cache structure has changed since version v4.36. 
" - f"If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, " - "please make sure to initialize the attention class " - "with a layer index.") - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_cache_freq_xpu(query_states, key_states, - sin, cos, "qwen2_moe", - position_ids) - else: - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) - if q_len == 1 and query_states.device.type == 'xpu' and not self.training \ - and not hidden_states.requires_grad: - import xe_addons - attn_weights = xe_addons.query_key_fp8_matmul(query_states, key_states) - else: - key_states, value_states = restore_fp8_kv_cache(key_states, - value_states, query_states.dtype) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - - attn_weights = attn_weights / math.sqrt(self.head_dim) - - invalidInputError(attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - ("Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}," - "but is {attn_weights.size()}")) - - if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - (f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}" - f" but is {attention_mask.size()}")) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.attention_dropout, training=self.training) - if q_len == 1 and query_states.device.type == 'xpu' and not self.training \ - and not hidden_states.requires_grad: - import xe_addons - attn_output = xe_addons.attn_value_fp8_matmul(attn_weights, value_states) - else: - attn_output = torch.matmul(attn_weights, value_states) - - invalidInputError(attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - "`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}," - f" but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def qwen2moe_attention_forward_origin( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
" - "Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) - - qtype_check = decoding_fast_path_qtype_check(self.q_proj) - decoding_fast_path = (qtype_check and use_fuse_rope - and enough_kv_room and bsz * q_len == 1) - decoding_fast_path = decoding_fast_path and not self.q_proj.enable_xetla - if decoding_fast_path: - hidden_states = hidden_states.view(1, -1) - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - kv_seq_len = cache_k.shape[-2] - import xe_linear - args = [hidden_states, self.q_proj.weight, self.k_proj.weight, self.v_proj.weight, - self.q_proj.bias, self.k_proj.bias, self.v_proj.bias, position_ids, cache_k, - cache_v, self.q_proj.weight.qtype, self.v_proj.weight.qtype, kv_seq_len, - self.head_dim, self.rotary_emb.base] - query_states, key_states, value_states = xe_linear.forward_qkv_bias(*args) - kv_seq_len += 1 - if self.layer_idx == 0: - past_key_value._seen_tokens = kv_seq_len - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, - self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - invalidInputError( - False, - "The cache structure has changed since version v4.36. " - f"If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, " - "please make sure to initialize the attention class with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_cache_freq_xpu(query_states, key_states, - sin, cos, "qwen2_moe", - position_ids) - else: - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids) - if past_key_value is not None: - if self.layer_idx == 0: - past_key_value._seen_tokens += key_states.shape[-2] - - if len(past_key_value.key_cache) <= self.layer_idx: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - - if not enough_kv_room: - # allocate new - new_c_k, new_c_v = extend_kv_cache(bsz, - self.num_key_value_heads, # Support GQA - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - - new_c_k[:] = cache_k - new_c_v[:] = cache_v - cache_k = new_c_k - cache_v = new_c_v - - key_states, value_states = append_kv_cache(cache_k, - cache_v, - key_states, - value_states) - - # update past_key_value - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if not self.training and not hidden_states.requires_grad and \ - use_flash_attention(query_states, key_states, attention_mask): - attn_output = F.scaled_dot_product_attention(query_states.to(device, dtype=torch.float16), - key_states.to(device, dtype=torch.float16), - value_states.to(device, dtype=torch.float16), - is_causal=True) - attn_weights = None - else: - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - invalidInputError(attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - ("Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}," - "but is {attn_weights.size()}")) - - if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - (f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}" - f" but is {attention_mask.size()}")) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - invalidInputError(attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - "`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}," - f" but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output.to(hidden_states.dtype), attn_weights, past_key_value - - -def qwen2moe_attention_forward_sdpa( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - 
output_attentions: bool = False, - use_cache: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) - - qtype_check = decoding_fast_path_qtype_check(self.q_proj) - decoding_fast_path = (qtype_check and use_fuse_rope - and enough_kv_room and bsz * q_len == 1) - decoding_fast_path = decoding_fast_path and not self.q_proj.enable_xetla - if decoding_fast_path: - hidden_states = hidden_states.view(1, -1) - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - kv_seq_len = cache_k.shape[-2] - import xe_linear - args = [hidden_states, self.q_proj.weight, self.k_proj.weight, self.v_proj.weight, - self.q_proj.bias, self.k_proj.bias, self.v_proj.bias, position_ids, cache_k, - cache_v, self.q_proj.weight.qtype, self.v_proj.weight.qtype, kv_seq_len, - self.head_dim, self.rotary_emb.base] - query_states, key_states, value_states = xe_linear.forward_qkv_bias(*args) - kv_seq_len += 1 - if self.layer_idx == 0: - past_key_value._seen_tokens = kv_seq_len - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, - self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, - self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - invalidInputError( - False, - "The cache structure has changed since version v4.36. " - f"If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, " - "please make sure to initialize the attention class with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_cache_freq_xpu(query_states, key_states, - sin, cos, "qwen2_moe", - position_ids) - else: - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, - cos, sin, position_ids) - if past_key_value is not None: - if self.layer_idx == 0: - past_key_value._seen_tokens += key_states.shape[-2] - - if len(past_key_value.key_cache) <= self.layer_idx: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) - else: - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - - if not enough_kv_room: - # allocate new - new_c_k, new_c_v = extend_kv_cache(bsz, - self.num_key_value_heads, # Support GQA - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - - new_c_k[:] = cache_k - new_c_v[:] = cache_v - cache_k = new_c_k - cache_v = new_c_v - - key_states, value_states = append_kv_cache(cache_k, - cache_v, - key_states, - value_states) - - # update past_key_value - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if output_attentions: - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - invalidInputError(attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - ("Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}," - "but is {attn_weights.size()}")) - - if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - (f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}" - f" but is {attention_mask.size()}")) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.attention_dropout, training=self.training) - else: - attn_weights = None - - from torch.nn.functional import scaled_dot_product_attention as sdpa - attn_output = sdpa(query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and q_len > 1) - - invalidInputError(attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - "`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}," - f" but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, attn_weights, past_key_value +def merge_qkv(module: torch.nn.Module): + if isinstance(module, Qwen2MoeAttention): + new_weight = torch.cat([ + module.q_proj.weight.data, + module.k_proj.weight.data, + module.v_proj.weight.data, + ], dim=0) + new_bias = torch.cat([ + module.q_proj.bias.data, + module.k_proj.bias.data, + module.v_proj.bias.data, + ], dim=-1) + + qkv_proj = torch.nn.Linear(0, 0, bias=True) + qkv_proj.weight = 
torch.nn.Parameter(new_weight, requires_grad=False) + qkv_proj.bias = torch.nn.Parameter(new_bias, requires_grad=False) + qkv_proj.in_features = new_weight.size(1) + qkv_proj.out_features = new_weight.size(0) + module.qkv_proj = qkv_proj + + del module.q_proj, module.k_proj, module.v_proj def qwen2moe_moeblock_forward(self, hidden_states: torch.Tensor): diff --git a/python/llm/src/ipex_llm/transformers/models/stablelm.py b/python/llm/src/ipex_llm/transformers/models/stablelm.py index c8a84557e84..9bef4c292cc 100644 --- a/python/llm/src/ipex_llm/transformers/models/stablelm.py +++ b/python/llm/src/ipex_llm/transformers/models/stablelm.py @@ -38,31 +38,19 @@ # import math -from typing import Optional, Tuple, List, Union +from typing import Optional, Tuple, List import torch -from torch import nn -import torch.nn.functional as F +from transformers.cache_utils import Cache +from transformers.models.stablelm.modeling_stablelm import repeat_kv from transformers.models.stablelm.modeling_stablelm import StableLmAttention, StableLmModel -from transformers.modeling_outputs import BaseModelOutputWithPast -from ipex_llm.utils.common import invalidInputError -from ipex_llm.transformers.models.utils import extend_kv_cache, append_kv_cache from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ apply_rotary_pos_emb_cache_freq_xpu -from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ - restore_fp8_kv_cache, use_quantize_kv_cache -from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_36 -from ipex_llm.transformers.models.utils import use_flash_attention, use_sdp -from ipex_llm.transformers.models.mistral import should_use_fuse_rope, repeat_kv -try: - from transformers.cache_utils import Cache -except ImportError: - Cache = Tuple[torch.Tensor] - -import os - -KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) +from ipex_llm.transformers.models.utils import use_sdp, use_sdp_causal +from ipex_llm.transformers.models.utils import restore_fp8_kv_cache, use_quantize_kv_cache +from ipex_llm.transformers.models.utils import should_use_fuse_rope +from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache def merge_qkv(module: torch.nn.Module): @@ -92,24 +80,26 @@ def merge_qkv(module: torch.nn.Module): def stablelm_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - from ipex_llm.transformers.kv import DynamicFp8Cache + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +): + # IPEX-LLM OPT: kv cache and quantize kv cache use_cache = use_cache if use_cache is not None else self.config.use_cache - if use_cache and use_quantize_kv_cache_stablelm(self.layers[0].self_attn.head_dim, - self.layers[0].mlp.up_proj, 
- input_ids): - if not isinstance(past_key_values, DynamicFp8Cache): + use_quantize_kv = (self.layers[0].self_attn.head_dim in [64, 96, 128] + and use_quantize_kv_cache(self.layers[0].mlp.up_proj, input_ids)) + if use_cache: + if use_quantize_kv and not isinstance(past_key_values, DynamicFp8Cache): past_key_values = DynamicFp8Cache.from_legacy_cache(past_key_values) + if not use_quantize_kv and not isinstance(past_key_values, DynamicNormalCache): + past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values) return StableLmModel.forward( self=self, input_ids=input_ids, @@ -124,10 +114,6 @@ def stablelm_model_forward( ) -def use_quantize_kv_cache_stablelm(head_dim: int, linear: torch.nn.Module, x: torch.Tensor) -> bool: - return (head_dim == 64 or head_dim == 128) and use_quantize_kv_cache(linear, x) - - def stablelm_attention_forward( self, hidden_states: torch.Tensor, @@ -137,55 +123,21 @@ def stablelm_attention_forward( output_attentions: bool = False, use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if use_quantize_kv_cache_stablelm(self.head_dim, self.o_proj, hidden_states): - forward_function = stablelm_attention_forward_quantized - else: - forward_function = stablelm_attention_forward_original - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - -def stablelm_attention_forward_original( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor]=None, - position_ids: Optional[torch.LongTensor]=None, - past_key_value: Optional[Cache]=None, - output_attentions: bool=False, - use_cache: bool=False, - **kwargs -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - # for flash attention - original_dtype = hidden_states.dtype - - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - enough_kv_room = is_enough_kv_cache_room_4_36(past_key_value, self.layer_idx) qkv = self.qkv_proj(hidden_states) qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) qkv = qkv.transpose(1, 2) query_states, key_states, value_states = qkv.split([self.num_heads, - self.num_heads, - self.num_heads], dim=1) + self.num_key_value_heads, + self.num_key_value_heads], dim=1) + # For stablelm-2-12b's qk per-head norm + if getattr(self, "qk_layernorm", False): + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - invalidInputError(self.layer_idx is not None, - "The cache structure has changed since version v4.36. 
" - f"If you are using {self.__class__.__name__} for " - "auto-regressive decodingwith k/v caching, please make sure " - "to initialize the attention class with a layer index.") kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) # Partial rotary embedding @@ -198,8 +150,8 @@ def stablelm_attention_forward_original( key_states[..., self.rotary_emb.dim:], ) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] - if use_fuse_rope: + # [batch_size, num_heads, seq_length, head_dim // config.partial_rotary_factor] + if should_use_fuse_rope(hidden_states, position_ids, self.training): query_rot, key_rot = apply_rotary_pos_emb_cache_freq_xpu(query_rot, key_rot, sin, @@ -214,251 +166,57 @@ def stablelm_attention_forward_original( position_ids, "stablelm") - # [batch_size, seq_length, num_heads, head_dim] + # [batch_size, num_heads, seq_length, head_dim] query_states = torch.cat((query_rot, query_pass), dim=-1) key_states = torch.cat((key_rot, key_pass), dim=-1) - if past_key_value is not None: - # update the number of seen tokens - if self.layer_idx == 0: - past_key_value.seen_tokens += key_states.shape[-2] + key_states, value_states = past_key_value.update(key_states, value_states, + self.layer_idx, None) - # reuse k, v, self_attention - # update `past_key_value` with `key_states` and `value_states` for layer `layer_idx` - if len(past_key_value.key_cache) <= self.layer_idx: - past_key_value.key_cache.append(key_states) - past_key_value.value_cache.append(value_states) + # IPEX-LLM OPT: sdp + attn_weights = None + if use_sdp(q_len, kv_seq_len, self.head_dim, query_states): + import xe_addons + if isinstance(past_key_value, DynamicFp8Cache): + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + attention_mask) else: - cache_k = past_key_value.key_cache[self.layer_idx] - cache_v = past_key_value.value_cache[self.layer_idx] - - if not enough_kv_room: - # allocate new - new_c_k, new_c_v = extend_kv_cache(bsz, - self.num_key_value_heads, # Support GQA - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - - new_c_k[:] = cache_k - new_c_v[:] = cache_v - cache_k = new_c_k - cache_v = new_c_v - - key_states, value_states = append_kv_cache(cache_k, cache_v, - key_states, value_states) - - # update past_key_value - past_key_value.key_cache[self.layer_idx] = key_states - past_key_value.value_cache[self.layer_idx] = value_states - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if not self.training and not hidden_states.requires_grad and \ - use_flash_attention(query_states, key_states, attention_mask): - attn_output = F.scaled_dot_product_attention(query_states.to(device, dtype=torch.float16), - key_states.to(device, dtype=torch.float16), - value_states.to(device, dtype=torch.float16), - is_causal=True) - attn_weights = None - elif not self.training and not hidden_states.requires_grad and \ - use_sdp(q_len, key_states.shape[2], self.head_dim, query_states): + attn_output = xe_addons.sdp(query_states, key_states, value_states, + attention_mask) + elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): import xe_addons - attn_output = xe_addons.sdp(query_states, key_states, value_states, - attention_mask) - attn_output = 
attn_output.view(query_states.shape) - attn_weights = None - else: - attn_weights = torch.matmul( - query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - invalidInputError( - attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}," - f" but is {attn_weights.size()}") - - if attention_mask is not None: - invalidInputError( - attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}") - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = \ - nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query_states.dtype) - attn_weights = self.attention_dropout(attn_weights) - - attn_output = torch.matmul(attn_weights, value_states) - - invalidInputError( - attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}," - f" but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output.to(original_dtype), attn_weights, past_key_value - - -def stablelm_attention_forward_quantized( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor]=None, - position_ids: Optional[torch.LongTensor]=None, - past_key_value: Optional[Cache]=None, - output_attentions: bool=False, - use_cache: bool=False, - **kwargs -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: - bsz, q_len, hidden_size = hidden_states.size() - device = hidden_states.device - # for flash attention - original_dtype = hidden_states.dtype - - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - qkv = self.qkv_proj(hidden_states) - qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim) - qkv = qkv.transpose(1, 2) - query_states, key_states, value_states = qkv.split([self.num_heads, - self.num_heads, - self.num_heads], dim=1) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - invalidInputError( - self.layer_idx is not None, - f"The cache structure has changed since version v4.36. " - "If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, " - "please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Partial rotary embedding - query_rot, query_pass = ( - query_states[..., : self.rotary_emb.dim], - query_states[..., self.rotary_emb.dim:], - ) - key_rot, key_pass = ( - key_states[..., : self.rotary_emb.dim], - key_states[..., self.rotary_emb.dim:], - ) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] - if use_fuse_rope: - query_rot, key_rot = apply_rotary_pos_emb_cache_freq_xpu(query_rot, - key_rot, - sin, - cos, - "stablelm", - position_ids) - else: - query_rot, key_rot = apply_rotary_pos_emb(query_rot, - key_rot, - cos, - sin, - position_ids, - "stablelm") - - # [batch_size, seq_length, num_heads, head_dim] - query_states = torch.cat((query_rot, query_pass), dim=-1) - key_states = torch.cat((key_rot, key_pass), dim=-1) - - kv_seq_len = key_states.shape[-2] - if len(past_key_value.key_cache) <= self.layer_idx: - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - attn_weights = attn_weights / math.sqrt(self.head_dim) - - invalidInputError( - attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}" - f", but is {attn_weights.size()}") - - if attention_mask is not None: - invalidInputError( - attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - - # at inference time, for memory considerations, may not need to upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1).to(query_states.dtype) - attn_weights = self.attention_dropout(attn_weights) - - attn_output = torch.matmul(attn_weights, value_states) - - invalidInputError( - attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}" - f", but is {attn_output.size()}") - if use_cache: - cache_kwargs = None - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + if isinstance(past_key_value, DynamicFp8Cache): + attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, + value_states, attention_mask) + else: + attn_output = xe_addons.sdp_causal(query_states, key_states, + value_states, attention_mask) else: - cache_kwargs = None # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) - kv_seq_len = key_states.shape[-2] - if query_states.size(2) != 1 or query_states.device.type != 'xpu': + if isinstance(past_key_value, DynamicFp8Cache): key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - else: - import xe_addons - attn_weights = xe_addons.query_key_fp8_matmul(query_states, key_states) - attn_weights = attn_weights / math.sqrt(self.head_dim) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) - invalidInputError( - attn_weights.size() 
== (bsz, self.num_heads, q_len, kv_seq_len), - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}" - f", but is {attn_weights.size()}") + attn_weights = torch.matmul(query_states, + key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: - invalidInputError( - attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}") attn_weights = attn_weights + attention_mask - # at inference time, for memory considerations, may not need to upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1) + # upcast attention to fp32 + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, + dtype=torch.float32).to(value_states.dtype) attn_weights = self.attention_dropout(attn_weights) - - if query_states.size(2) != 1 or query_states.device.type != 'xpu': - attn_output = torch.matmul(attn_weights, value_states) - else: - import xe_addons - attn_output = xe_addons.attn_value_fp8_matmul(attn_weights, value_states) - - attn_output_size = (bsz, self.num_heads, q_len, self.head_dim) - invalidInputError(attn_output.size() == attn_output_size, - f"`attn_output` should be of size {attn_output_size}," - f" but is {attn_output.size()}") + attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None - return attn_output.to(original_dtype), attn_weights, past_key_value + return attn_output, attn_weights, past_key_value diff --git a/python/llm/src/ipex_llm/transformers/models/starcoder2.py b/python/llm/src/ipex_llm/transformers/models/starcoder2.py index b0e83f48681..654d5c0a654 100644 --- a/python/llm/src/ipex_llm/transformers/models/starcoder2.py +++ b/python/llm/src/ipex_llm/transformers/models/starcoder2.py @@ -42,7 +42,7 @@ from ipex_llm.transformers.models.utils import ( use_quantize_kv_cache, restore_fp8_kv_cache, - apply_rotary_pos_emb_no_cache_xpu + should_use_fuse_rope, use_sdp, use_sdp_causal ) from ipex_llm.transformers.kv import DynamicFp8Cache, DynamicNormalCache from ipex_llm.utils.common.log4Error import invalidInputError @@ -53,16 +53,6 @@ from transformers.models.starcoder2.modeling_starcoder2 import Starcoder2Model, Starcoder2Attention -def should_use_fuse_rope(self, hidden_states, position_ids): - use_fuse_rope = ( - hidden_states.device.type == "xpu" and - hidden_states.numel() == hidden_states.size(-1) and - not (self.training and hidden_states.requires_grad) and - position_ids is not None - ) - return use_fuse_rope - - def merge_qkv(module: torch.nn.Module): if isinstance(module, Starcoder2Attention): new_weight = torch.cat([ @@ -115,12 +105,10 @@ def attention_forward( kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) # IPEX-LLM OPT: fuse rope - if should_use_fuse_rope(self, hidden_states, position_ids): - query_states, key_states = apply_rotary_pos_emb_no_cache_xpu(query_states, - key_states, - position_ids, - "mistral", - self.rope_theta) + if should_use_fuse_rope(hidden_states, position_ids, self.training): + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb( @@ -129,21 +117,30 @@ def 
attention_forward( # IPEX-LLM OPT: kv cache and quantize kv cache invalidInputError(past_key_value is not None, "`past_key_value` cannot be None") - use_quantize_kv = use_quantize_kv_cache(self.o_proj, hidden_states) - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, None) - if use_quantize_kv and q_len == 1: + # IPEX-LLM OPT: sdp + if use_sdp(q_len, kv_seq_len, self.head_dim, query_states): import xe_addons - attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + if isinstance(past_key_value, DynamicFp8Cache): + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + attention_mask) + else: + attn_output = xe_addons.sdp(query_states, key_states, value_states, attention_mask) - attn_weights = None + elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): + import xe_addons + if isinstance(past_key_value, DynamicFp8Cache): + attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, + value_states, attention_mask) + else: + attn_output = xe_addons.sdp_causal(query_states, key_states, + value_states, attention_mask) else: - if use_quantize_kv: + if isinstance(past_key_value, DynamicFp8Cache): key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) - # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) diff --git a/python/llm/src/ipex_llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py index 3e2878b9542..449d331ada9 100644 --- a/python/llm/src/ipex_llm/transformers/models/utils.py +++ b/python/llm/src/ipex_llm/transformers/models/utils.py @@ -301,6 +301,8 @@ def use_flash_attention(query, key, attention_mask=None): # ipex flash attention is only supported for xetla # may update this later return False + elif get_xpu_device_type(query) != "pvc": + return False if query.dtype not in [torch.float32, torch.float16]: # only use flash attention for fp32/fp16 input return False diff --git a/python/llm/src/ipex_llm/transformers/models/yuan.py b/python/llm/src/ipex_llm/transformers/models/yuan.py index a2d48bb5303..9f480ad382f 100644 --- a/python/llm/src/ipex_llm/transformers/models/yuan.py +++ b/python/llm/src/ipex_llm/transformers/models/yuan.py @@ -20,32 +20,41 @@ # https://huggingface.co/IEITYuan/Yuan2-2B-hf/blob/7ab7b3c18eb8e5232ce2a3f720d4e6f4b53a2806/README.md#%E5%A3%B0%E6%98%8E%E4%B8%8E%E5%8D%8F%E8%AE%AEterms-and-conditions # -import copy import math -from einops import rearrange from typing import Optional, Tuple import torch -import torch.nn as nn from ipex_llm.utils.common import invalidInputError from ipex_llm.transformers.models.utils import apply_rotary_pos_emb, \ - apply_rotary_pos_emb_cache_freq_xpu, mlp_fusion_check, fp16_fusion_check -from ipex_llm.transformers.models.utils import init_kv_cache, extend_kv_cache, append_kv_cache -from ipex_llm.transformers.models.utils import init_fp8_kv_cache, append_fp8_kv_cache, \ - restore_fp8_kv_cache, use_quantize_kv_cache -from ipex_llm.transformers.models.utils import is_enough_kv_cache_room_4_31, SILU - -import os - -KV_CACHE_ALLOC_BLOCK_LENGTH = int(os.environ.get("KV_CACHE_ALLOC_BLOCK_LENGTH", 256)) - - -def should_use_fuse_rope(self, hidden_states, position_ids): - use_fuse_rope = hidden_states.device.type == "xpu" - use_fuse_rope = use_fuse_rope and not (self.training and hidden_states.requires_grad) - use_fuse_rope = use_fuse_rope and 
position_ids is not None - return use_fuse_rope + mlp_fusion_check, fp16_fusion_check +from ipex_llm.transformers.models.utils import use_quantize_kv_cache, restore_fp8_kv_cache +from ipex_llm.transformers.models.utils import SILU, update_past_key_value +from ipex_llm.transformers.models.utils import should_use_fuse_rope, use_sdp, use_sdp_causal + + +def merge_qk(module: torch.nn.Module): + if "YuanAttention" in module.__class__.__name__: + q_weight = module.q_proj.weight.data + k_weight = module.k_proj.weight.data + num_heads = module.num_heads + head_dim = module.head_dim + hidden_size = module.hidden_size + + merged_qk_proj = torch.nn.Linear(0, 0, False) + weight = torch.cat([ + q_weight.view(num_heads, head_dim, hidden_size)[0::2, :, :], + k_weight.view(num_heads, head_dim, hidden_size)[0::2, :, :], + q_weight.view(num_heads, head_dim, hidden_size)[1::2, :, :], + k_weight.view(num_heads, head_dim, hidden_size)[1::2, :, :], + ], dim=0).view(num_heads * head_dim * 2, hidden_size) + merged_qk_proj.weight = torch.nn.Parameter(weight, requires_grad=False) + merged_qk_proj.in_features = hidden_size + merged_qk_proj.out_features = num_heads * head_dim * 2 + module.qk_proj = merged_qk_proj + + del module.q_proj + del module.k_proj def yuan_localized_filtering_forward( @@ -142,43 +151,14 @@ def yuan_attention_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if use_quantize_kv_cache(self.merged_q_proj, hidden_states): - forward_function = yuan_attention_forward_quantized - else: - forward_function = yuan_attention_forward_origin - return forward_function( - self=self, - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - -def yuan_attention_forward_quantized( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() device = hidden_states.device - before_hidden_states = None - is_first_step = False - - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) invalidInputError(use_cache, "use_cache=True is needed") invalidInputError(not self.use_shareqk, "use_shareqk is not supported for now") if past_key_value is None: - is_first_step = True if q_len >= 2: before_hidden_states = hidden_states[:, -2:, :].transpose(0, 1).half() else: @@ -193,112 +173,75 @@ def yuan_attention_forward_quantized( ], dim=0) before_hidden_states = this_hidden_states[-2:, :, ] - value_states = \ - self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states) + value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - if is_first_step: + if past_key_value is None: hidden_states = yuan_localized_filtering_forward(self.lf_gate, hidden_states, None, hidden_states.dtype) else: hidden_states = yuan_localized_filtering_forward(self.lf_gate, hidden_states, this_hidden_states, hidden_states.dtype) - query_states = self.merged_q_proj(hidden_states) - key_states = self.merged_k_proj(hidden_states) - 
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + qk_states = self.qk_proj(hidden_states) + qk_states = qk_states.view(bsz, q_len, self.num_heads * 2, self.head_dim) + qk_states = qk_states.transpose(1, 2) + query_states, key_states = torch.chunk(qk_states, 2, dim=1) kv_seq_len = key_states.shape[-2] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_cache_freq_xpu(query_states, - key_states, - sin, cos, - "yuan", - position_ids) + if should_use_fuse_rope(hidden_states, position_ids, self.training): + import xe_addons + xe_addons.rotary_half_inplaced(self.rotary_emb.inv_freq, position_ids, + query_states, key_states) else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids, "yuan") - if past_key_value is None: - # should use origin attn here - attn_weights = torch.matmul(query_states, - key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - invalidInputError(attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - "Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}, " - f"but is {attn_weights.size()}") - - if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if use_cache: - k_cache, v_cache = init_fp8_kv_cache( - bsz, self.num_heads, kv_seq_len, self.head_dim, device=device - ) - key_states, value_states = append_fp8_kv_cache(k_cache, v_cache, - key_states, value_states) - past_key_value = (key_states, value_states, before_hidden_states) - + # IPEX-LLM OPT: kv cache and quantzie kv cache + use_quantize_kv = use_quantize_kv_cache(self.qk_proj, hidden_states) + key_states, value_states = update_past_key_value( + None if past_key_value is None else (past_key_value[0], past_key_value[1]), + key_states, value_states, + kv_seq_len, use_quantize_kv, device + ) + past_key_value = (key_states, value_states, before_hidden_states) if use_cache else None + + # IPEX-LLM OPT: sdp + if use_sdp(q_len, kv_seq_len, self.head_dim, query_states): + import xe_addons + if use_quantize_kv: + attn_output = xe_addons.sdp_fp8(query_states, key_states, value_states, + attention_mask) + else: + attn_output = xe_addons.sdp(query_states, key_states, value_states, + attention_mask) + elif use_sdp_causal(q_len, kv_seq_len, self.head_dim, query_states, self.training): + import xe_addons + if use_quantize_kv: + attn_output = xe_addons.sdp_fp8_causal(query_states, key_states, + value_states, attention_mask) + else: + attn_output = xe_addons.sdp_causal(query_states, key_states, + value_states, attention_mask) else: - k_cache, v_cache, _ = past_key_value - key_states, value_states = append_fp8_kv_cache(k_cache, v_cache, - key_states, value_states) - past_key_value = (key_states, value_states, 
before_hidden_states) - - # torch.matmul - if query_states.size(2) != 1 or device.type != 'xpu': + if use_quantize_kv: key_states, value_states = restore_fp8_kv_cache(key_states, value_states, query_states.dtype) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - else: - import xe_addons - attn_weights = xe_addons.query_key_fp8_matmul(query_states, key_states) - - attn_weights = attn_weights / math.sqrt(self.head_dim) - - invalidInputError(attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - "Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}, " - f"but is {attn_weights.size()}") - + attn_weights = torch.matmul(query_states, + key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " - f"but is {attention_mask.size()}") attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, - dtype=torch.float32).to(query_states.dtype) - if query_states.size(2) != 1 or device.type != 'xpu': - attn_output = torch.matmul(attn_weights, value_states) - else: - import xe_addons - attn_output = xe_addons.attn_value_fp8_matmul(attn_weights, value_states) - - invalidInputError(attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - "`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}, " - f"but is {attn_output.size()}") + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, + dtype=torch.float32).to(value_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -307,146 +250,3 @@ def yuan_attention_forward_quantized( attn_weights = None return attn_output, attn_weights, past_key_value - - -def yuan_attention_forward_origin( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - use_fuse_rope = should_use_fuse_rope(self, hidden_states, position_ids) - bsz, q_len, _ = hidden_states.size() - device = hidden_states.device - before_hidden_states = None - is_first_step = False - self.use_shareqk = False - - enough_kv_room = is_enough_kv_cache_room_4_31(past_key_value) - - invalidInputError(use_cache, "use_cache=True is needed") - invalidInputError(not self.use_shareqk, "use_shareqk is not supported for now") - - if past_key_value is None: - is_first_step = True - if q_len >= 2: - before_hidden_states = hidden_states[:, -2:, :].transpose(0, 1).half() - else: - before_hidden_states = torch.zeros(2, bsz, self.hidden_size, - dtype=torch.half, device=hidden_states.device) - before_hidden_states[-1:, :, :] = hidden_states[:, -1:, :].transpose(0, 1) - else: - before_hidden_states = past_key_value[2] - this_hidden_states = torch.cat([ - before_hidden_states, - hidden_states.transpose(0, 1).half(), - ], dim=0) - before_hidden_states = this_hidden_states[-2:, :, ] - - value_states = \ - self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, 
self.head_dim).transpose(1, 2) - - if is_first_step: - hidden_states = yuan_localized_filtering_forward(self.lf_gate, hidden_states, - None, hidden_states.dtype) - else: - hidden_states = yuan_localized_filtering_forward(self.lf_gate, hidden_states, - this_hidden_states, hidden_states.dtype) - query_states = self.merged_q_proj(hidden_states) - key_states = self.merged_k_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - if use_fuse_rope: - query_states, key_states = apply_rotary_pos_emb_cache_freq_xpu(query_states, - key_states, - sin, cos, - "yuan", - position_ids) - else: - query_states, key_states = apply_rotary_pos_emb(query_states, - key_states, - cos, sin, - position_ids, - "yuan") - - if past_key_value is not None: - # reuse k, v, self_attention - cache_k = past_key_value[0] - cache_v = past_key_value[1] - if not enough_kv_room: - # allocate new - new_cache_k, new_cache_v = extend_kv_cache(bsz, - self.num_heads, - self.head_dim, - cache_k.size(2), - kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH, - dtype=cache_k.dtype, - device=device) - new_cache_k[:] = cache_k - new_cache_v[:] = cache_v - cache_k = new_cache_k - cache_v = new_cache_v - - key_states, value_states = append_kv_cache(cache_k, cache_v, key_states, value_states) - - elif use_cache: - max_cache_length = kv_seq_len + KV_CACHE_ALLOC_BLOCK_LENGTH - new_key_states, new_value_states = init_kv_cache(bsz, - self.num_heads, - self.head_dim, - kv_seq_len, - max_cache_length, - dtype=key_states.dtype, - device=device) - new_key_states[:] = key_states - new_value_states[:] = value_states - key_states = new_key_states - value_states = new_value_states - - past_key_value = \ - (key_states, value_states, before_hidden_states) if use_cache else None - - attn_weights = \ - torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - invalidInputError(attn_weights.size() == (bsz, self.num_heads, q_len, kv_seq_len), - "Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}, " - f"but is {attn_weights.size()}") - - if attention_mask is not None: - invalidInputError(attention_mask.size() == (bsz, 1, q_len, kv_seq_len), - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, " - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, - torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = \ - torch.nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - invalidInputError(attn_output.size() == (bsz, self.num_heads, q_len, self.head_dim), - "`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}, " - f"but is {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - return attn_output, attn_weights, past_key_value diff --git a/python/llm/src/ipex_llm/transformers/speculative.py b/python/llm/src/ipex_llm/transformers/speculative.py index 
2f123659dfb..6d2e08423ad 100644 --- a/python/llm/src/ipex_llm/transformers/speculative.py +++ b/python/llm/src/ipex_llm/transformers/speculative.py @@ -162,6 +162,8 @@ def clear_benchmarks(self): self.generate_time = [] self.draft_time = [] self.verify_time = [] + self.match_time = [] + self.post_time = [] self.draft_num = [] self.accept_num = [] self.n_drafted = 0 diff --git a/python/llm/src/ipex_llm/utils/ipex_importer.py b/python/llm/src/ipex_llm/utils/ipex_importer.py index bf5f6ff0b39..0b60e48c46b 100644 --- a/python/llm/src/ipex_llm/utils/ipex_importer.py +++ b/python/llm/src/ipex_llm/utils/ipex_importer.py @@ -16,6 +16,46 @@ from importlib.metadata import distribution, PackageNotFoundError import logging +import builtins +import sys +from ipex_llm.utils.common import log4Error +import inspect + +# Save the original __import__ function +original_import = builtins.__import__ +ipex_duplicate_import_error = "intel_extension_for_pytorch has already been automatically " + \ + "imported. Please avoid importing it again!" + + +def get_calling_package(): + """ + Return calling package name, e.g., ipex_llm.transformers + """ + # Get the current stack frame + frame = inspect.currentframe() + # Get the caller's frame + caller_frame = frame.f_back.f_back + # Get the caller's module + module = inspect.getmodule(caller_frame) + if module: + # Return the module's package name + return module.__package__ + return None + + +def custom_ipex_import(name, globals=None, locals=None, fromlist=(), level=0): + """ + Custom import function to avoid importing ipex again + """ + # check import calling pacage + calling_package = get_calling_package() + if calling_package is not None: + return original_import(name, globals, locals, fromlist, level) + # Only check ipex for main thread + if name == "ipex" or name == "intel_extension_for_pytorch": + log4Error.invalidInputError(False, + ipex_duplicate_import_error) + return original_import(name, globals, locals, fromlist, level) class IPEXImporter: @@ -51,15 +91,36 @@ def is_xpu_version_installed(): def import_ipex(self): """ - Try to import Intel Extension for PyTorch as ipex + Try to import Intel Extension for PyTorch as ipex for XPU - Raises ImportError if failed + Raises ImportError and invalidInputError if failed """ if self.is_xpu_version_installed(): - import intel_extension_for_pytorch as ipex + # Check if user import ipex manually + if 'ipex' in sys.modules or 'intel_extension_for_pytorch' in sys.modules: + log4Error.invalidInputError(False, + ipex_duplicate_import_error) + self.directly_import_ipex() self.ipex_version = ipex.__version__ + # Replace default importer + builtins.__import__ = custom_ipex_import logging.info("intel_extension_for_pytorch auto imported") + def directly_import_ipex(self): + """ + Try to import Intel Extension for PyTorch as ipex + + Raises ImportError and invalidInputError if failed + """ + # import ipex + import intel_extension_for_pytorch as ipex + if ipex is not None: + # Expose ipex to Python builtins + builtins.ipex = ipex + else: + log4Error.invalidInputError(False, + "Can not import intel_extension_for_pytorch.") + def get_ipex_version(self): """ Get ipex version @@ -69,11 +130,10 @@ def get_ipex_version(self): if self.ipex_version is not None: return self.ipex_version # try to import Intel Extension for PyTorch and get version - try: - import intel_extension_for_pytorch as ipex - self.ipex_version = ipex.__version__ - except ImportError: - self.ipex_version = None + self.directly_import_ipex() + self.ipex_version = 
ipex.__version__ + # Replace default importer + builtins.__import__ = custom_ipex_import return self.ipex_version diff --git a/python/llm/src/ipex_llm/vllm/cpu/engine/engine.py b/python/llm/src/ipex_llm/vllm/cpu/engine/engine.py index 546c25dd346..7a964762384 100644 --- a/python/llm/src/ipex_llm/vllm/cpu/engine/engine.py +++ b/python/llm/src/ipex_llm/vllm/cpu/engine/engine.py @@ -37,7 +37,7 @@ def from_engine_args( engine_args: AsyncEngineArgs, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - load_in_low_bit: str = "sym_int4", + load_in_low_bit: Optional[str] = None, ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" # Enable ipex-llm optimizations @@ -97,7 +97,7 @@ def __init__( max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, - load_in_low_bit: str = "sym_int4", + load_in_low_bit: Optional[str] = None, **kwargs, ) -> None: if "disable_log_stats" not in kwargs: @@ -136,8 +136,7 @@ def from_engine_args( cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - load_in_low_bit: str = "sym_int4", - # ipex_llm_optimize_mode: str = 'NATIVE', + load_in_low_bit: Optional[str] = None, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. diff --git a/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py b/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py index 09a02a4a2f3..31991d5028e 100644 --- a/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py +++ b/python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py @@ -65,7 +65,7 @@ def parse_args(): parser.add_argument( "--load-in-low-bit", type=str, - default="sym_int4", + default=None, help="Low-bit quantization for IPEX-LLM models") return parser.parse_args() diff --git a/python/llm/src/ipex_llm/vllm/cpu/model_convert.py b/python/llm/src/ipex_llm/vllm/cpu/model_convert.py index 4228eb0619d..ff6515426d8 100644 --- a/python/llm/src/ipex_llm/vllm/cpu/model_convert.py +++ b/python/llm/src/ipex_llm/vllm/cpu/model_convert.py @@ -16,6 +16,7 @@ import torch from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader.utils import get_model_architecture from vllm.model_executor.models.llama import LlamaMLP, LlamaAttention from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Attention from vllm.model_executor.models.qwen import QWenMLP, QWenAttention @@ -24,8 +25,13 @@ from vllm.attention import Attention, AttentionMetadata from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.config import DeviceConfig -from typing import Tuple +from vllm.logger import init_logger + +from vllm._C import ops from ipex_llm.utils.common import invalidInputError +from typing import List, Optional, Tuple, Union + +logger = init_logger(__name__) def _MLP_forward(self, x): @@ -42,7 +48,7 @@ def _Attention_forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - qkv = self.qkv_proj(hidden_states) + qkv = self.qkv_proj(hidden_states).to(dtype=kv_cache.dtype) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale) @@ -57,10 +63,10 @@ def _QWen_Attention_forward( kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: AttentionMetadata, ) -> 
torch.Tensor: - qkv = self.c_attn(hidden_states) + qkv = self.c_attn(hidden_states).to(dtype=kv_cache.dtype) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output = self.c_proj(attn_output) return output @@ -72,6 +78,21 @@ def _QWen_MLP_forward(self, x): return x +def _Qwen2_Attention_forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + qkv = self.qkv_proj(hidden_states).to(dtype=kv_cache.dtype) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output = self.o_proj(attn_output) + return output + + def _ChatGLM_MLP_forward(self, hidden_states): # [s, b, 4hp] intermediate_parallel = self.dense_h_to_4h(hidden_states) @@ -88,11 +109,11 @@ def _Baichuan_Attention_forward( kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: AttentionMetadata, ) -> torch.Tensor: - qkv = self.W_pack(hidden_states) + qkv = self.W_pack(hidden_states).to(dtype=kv_cache.dtype) q, k, v = qkv.chunk(chunks=3, dim=-1) if self.postion_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output = self.o_proj(attn_output) return output @@ -104,7 +125,7 @@ def _ChatGLM_Attention_forward( kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: AttentionMetadata, ) -> torch.Tensor: - qkv = self.query_key_value(hidden_states) + qkv = self.query_key_value(hidden_states).to(dtype=kv_cache.dtype) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(position_ids, q, k) context_layer = self.attn( @@ -121,18 +142,25 @@ def _ChatGLM_Attention_forward( LlamaMLP: _MLP_forward, Qwen2MLP: _MLP_forward, BaiChuanMLP: _MLP_forward, - QWenMLP: _QWen_MLP_forward, + # QWenMLP: _QWen_MLP_forward, GLMMLP: _ChatGLM_MLP_forward } _REPLACED_ATTENTION_LAYERS = { LlamaAttention: _Attention_forward, - Qwen2Attention: _Attention_forward, - QWenAttention: _QWen_Attention_forward, + Qwen2Attention: _Qwen2_Attention_forward, + # QWenAttention: _QWen_Attention_forward, BaiChuanAttention: _Baichuan_Attention_forward, GLMAttention: _ChatGLM_Attention_forward } +_IPEX_LLM_SUPPORTED_MODELS = [ + "LlamaForCausalLM", + "BaichuanForCausalLM", + "ChatGLMForCausalLM", + "Qwen2ForCausalLM", +] + def _model_mlp_convert(): for module, replaced_func in _REPLACED_MLP_LAYERS.items(): @@ -145,37 +173,100 @@ def _model_attention_convert(): def _ipex_llm_convert(load_in_low_bit): - from vllm.worker.model_runner import ModelRunner + if load_in_low_bit is None: + return + from vllm.worker.cpu_model_runner import CPUModelRunner import vllm.model_executor.model_loader as model_loader - setattr(ModelRunner, "load_model", get_load_function(load_in_low_bit)) + setattr(CPUModelRunner, "load_model", get_load_function(load_in_low_bit)) + + from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding + setattr(RotaryEmbedding, "forward", _ipex_llm_rotary_embedding_forward) + from vllm.model_executor.layers.layernorm import RMSNorm + setattr(RMSNorm, "forward", _ipex_llm_rmsnorm_forward) + + +def _ipex_llm_rotary_embedding_forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + 
key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype) + + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + + +def _ipex_llm_rmsnorm_forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + x = x.to(dtype=self.weight.data.dtype) + if residual is not None: + residual = residual.to(dtype=self.weight.data.dtype) + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out def get_load_function(low_bit): def _ipex_llm_load_model(self) -> None: + model_class = get_model_architecture(self.model_config)[1] + cur_model_list = ", ".join(_IPEX_LLM_SUPPORTED_MODELS) + if low_bit != "bf16": + invalidInputError(model_class in _IPEX_LLM_SUPPORTED_MODELS, + f"Currently IPEX-LLM vLLM convert only support {cur_model_list}.") + else: + if model_class not in _IPEX_LLM_SUPPORTED_MODELS: + logger.warning( + f"Currently IPEX-LLM vLLM convert only support {cur_model_list}." + ) + self.model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + vision_language_config=self.vision_language_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + return + _model_mlp_convert() _model_attention_convert() - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + self.model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + vision_language_config=self.vision_language_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + from ipex_llm import optimize_model optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype) - if self.lora_config: - invalidInputError(hasattr(self.model, "supported_lora_modules") - and self.model.supported_lora_modules, - "Model does not support LoRA") - invalidInputError(hasattr(self.model, "embedding_modules"), - "Model does not have embedding_modules") - invalidInputError(hasattr(self.model, "embedding_padding_modules"), - "Model does not have embedding_padding_modules") - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, self.vocab_size, - self.lora_config, self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) - self.model = self.lora_manager.create_lora_manager(self.model) return _ipex_llm_load_model diff --git a/python/llm/test/benchmark/arc-perf-test-batch2.yaml b/python/llm/test/benchmark/arc-perf-test-batch2.yaml new file mode 100644 index 00000000000..00b2e4c1a53 --- /dev/null +++ 
b/python/llm/test/benchmark/arc-perf-test-batch2.yaml @@ -0,0 +1,38 @@ +repo_id: + - 'meta-llama/Llama-2-7b-chat-hf' + - 'meta-llama/Llama-2-13b-chat-hf' + - 'THUDM/chatglm2-6b' + - 'THUDM/chatglm3-6b-4bit' + - 'tiiuae/falcon-7b-instruct-with-patch' + - 'mosaicml/mpt-7b-chat' + - 'redpajama/gptneox-7b-redpajama-bf16' + - 'bigcode/starcoder-15.5b-4bit' + - 'databricks/dolly-v1-6b' + - 'databricks/dolly-v2-7b' + - 'databricks/dolly-v2-12b' + - 'internlm/internlm-chat-7b' + - 'Qwen/Qwen-7B-Chat' + - 'BAAI/AquilaChat-7B' + - 'baichuan-inc/Baichuan2-7B-Chat' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit' + - 'bigscience/bloomz-7b1' +# - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+ + - 'mistralai/Mistral-7B-v0.1' +local_model_hub: '/mnt/disk1/models' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) +batch_size: 2 # default to 1 +in_out_pairs: + - '32-32' + - '1024-128' + - '2048-256' +test_api: + - "transformer_int4_gpu" # on Intel GPU +cpu_embedding: False # whether to put embedding on CPU (only available now for gpu win related test_api) +exclude: + - 'bigcode/starcoder-15.5b-4bit:2048' + - 'databricks/dolly-v2-12b:2048' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' + - 'bigscience/bloomz-7b1:2048' \ No newline at end of file diff --git a/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml b/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml new file mode 100644 index 00000000000..c9644dc905c --- /dev/null +++ b/python/llm/test/benchmark/arc-perf-transformers-437-batch2.yaml @@ -0,0 +1,19 @@ +# For the models that require transformers 4.37.0 +repo_id: + - 'Qwen/Qwen1.5-7B-Chat' + - 'microsoft/phi-2' + - 'microsoft/Phi-3-mini-4k-instruct' + - 'meta-llama/Meta-Llama-3-8B-Instruct' +local_model_hub: '/mnt/disk1/models' +warm_up: 1 +num_trials: 3 +num_beams: 1 # default to greedy search +low_bit: 'sym_int4' # default to use 'sym_int4' (i.e.
symmetric int4) +batch_size: 2 # default to 1 +in_out_pairs: + - '32-32' + - '1024-128' + - '2048-256' +test_api: + - "transformer_int4_gpu" # on Intel GPU +cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) diff --git a/python/llm/test/benchmark/check_results.py b/python/llm/test/benchmark/check_results.py index 528c41df641..861c3ecf267 100644 --- a/python/llm/test/benchmark/check_results.py +++ b/python/llm/test/benchmark/check_results.py @@ -34,16 +34,20 @@ def main(): actual_test_num = len(csv_dataframe) actual_test_cases = [] for index, row in csv_dataframe.iterrows(): - actual_test_cases.append(row['model'] + ":" + row['input/output tokens'].split('-')[0]) - + actual_test_cases.append(row['model'] + ":" + row['input/output tokens'].split('-')[0] + ":" + str(row['batch_size'])) if args.yaml_name: yaml_name = args.yaml_name conf = OmegaConf.load(yaml_name) all_test_cases = [] for model in conf.repo_id: for in_out in conf['in_out_pairs']: - model_id_input = model + ':' + in_out.split('-')[0] - all_test_cases.append(model_id_input) + if not OmegaConf.is_list(conf["batch_size"]): + batch_list = [conf["batch_size"]] + else: + batch_list = conf["batch_size"] + for batch_size in batch_list: + model_id_input = model + ':' + in_out.split('-')[0] + ':' + str(batch_size) + all_test_cases.append(model_id_input) exclude_test_cases = [] if 'exclude' in conf and conf['exclude'] is not None: exclude_test_cases = conf['exclude'] diff --git a/python/llm/test/benchmark/csv_to_html.py b/python/llm/test/benchmark/csv_to_html.py index 9b146f9a16f..2720b338abb 100644 --- a/python/llm/test/benchmark/csv_to_html.py +++ b/python/llm/test/benchmark/csv_to_html.py @@ -99,10 +99,15 @@ def main(): for current_csv_ind,current_csv_row in current_csv.iterrows(): current_csv_model=current_csv_row['model'].strip() current_csv_input_output_pairs=current_csv_row['input/output tokens'].strip() - current_csv_model_input_1st=current_csv_model+'-'+current_csv_input_output_pairs+'-'+'1st' - current_csv_model_input_2nd=current_csv_model+'-'+current_csv_input_output_pairs+'-'+'2nd' - add_to_dict(csv_dict, current_csv_model_input_1st, current_csv_row[latency_1st_token]) - add_to_dict(csv_dict, current_csv_model_input_2nd, current_csv_row[latency_2_avg]) + try: + current_csv_batch_size=str(current_csv_row['batch_size']) + current_csv_model_input_1st=current_csv_model+'-'+current_csv_input_output_pairs+'-'+current_csv_batch_size+'-'+'1st' + current_csv_model_input_2nd=current_csv_model+'-'+current_csv_input_output_pairs+'-'+current_csv_batch_size+'-'+'2nd' + add_to_dict(csv_dict, current_csv_model_input_1st, current_csv_row[latency_1st_token]) + add_to_dict(csv_dict, current_csv_model_input_2nd, current_csv_row[latency_2_avg]) + except KeyError: + #Old csv/html files didn't include 'batch_size' + pass for latest_csv_ind,latest_csv_row in latest_csv.iterrows(): @@ -110,9 +115,10 @@ def main(): latest_csv_input_output_pairs=latest_csv_row['input/output tokens'].strip() latest_1st_token_latency=latest_csv_row[latency_1st_token] latest_2_avg_latency=latest_csv_row[latency_2_avg] + latest_csv_batch_size=str(latest_csv_row['batch_size']) - key1=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+'1st' - key2=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+'2nd' + key1=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+latest_csv_batch_size+'-'+'1st' + key2=latest_csv_model+'-'+latest_csv_input_output_pairs+'-'+latest_csv_batch_size+'-'+'2nd' 
best_last1_value=best_in_dict(csv_dict, key1, latest_1st_token_latency) best_last2_value=best_in_dict(csv_dict, key2, latest_2_avg_latency) @@ -128,8 +134,9 @@ def main(): previous_csv_model=previous_csv_row['model'].strip() previous_csv_input_output_pairs=previous_csv_row['input/output tokens'].strip() + previous_csv_batch_size=str(previous_csv_row['batch_size']) - if latest_csv_model==previous_csv_model and latest_csv_input_output_pairs==previous_csv_input_output_pairs: + if latest_csv_model==previous_csv_model and latest_csv_input_output_pairs==previous_csv_input_output_pairs and latest_csv_batch_size==previous_csv_batch_size: previous_1st_token_latency=previous_csv_row[latency_1st_token] previous_2_avg_latency=previous_csv_row[latency_2_avg] diff --git a/python/llm/test/benchmark/merge_csv_batch.py b/python/llm/test/benchmark/merge_csv_batch.py new file mode 100644 index 00000000000..453f46c4bf5 --- /dev/null +++ b/python/llm/test/benchmark/merge_csv_batch.py @@ -0,0 +1,45 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Python program to concat CSVs + +import os +import sys +import argparse +import pandas as pd + +def main(): + parser = argparse.ArgumentParser(description="concat .csv files") + parser.add_argument("-f", "--folder_path", type=str, dest="folder_path", + help="The directory which stores the .csv files", default="./") + args = parser.parse_args() + + csv_files = [] + for file_name in os.listdir(args.folder_path): + file_path = os.path.join(args.folder_path, file_name) + if os.path.isfile(file_path) and file_name.endswith(".csv"): + csv_files.append(file_path) + csv_files.sort() + + merged_df = pd.concat([pd.read_csv(file, index_col=0) for file in csv_files], ignore_index=True) + merged_df["input_len"] = merged_df["input/output tokens"].apply(lambda x: int(x.split("-")[0])) + merged_df = merged_df.sort_values(by=["model", "input_len", "batch_size"]) + merged_df.reset_index(drop=True, inplace=True) + merged_csv = csv_files[0].replace("_batch1", "").replace("_batch2", "").replace("_batch4", "") + merged_df.drop("input_len", axis=1).to_csv(merged_csv) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/llm/tpp/README.md b/python/llm/tpp/README.md new file mode 100644 index 00000000000..7652e80c067 --- /dev/null +++ b/python/llm/tpp/README.md @@ -0,0 +1,3 @@ +Third Party Software notices and information +------------------------------------------------------------- +“Third Party Software” mean the files (if any) listed in the “third-party-programs.txt” or other similarly-named text file that may be included with the software. Third Party Software, even if included with the distribution of the software, may be governed by separate license terms, including without limitation, third party license terms, open source software notices and terms, and/or other Intel software license terms. These separate license terms solely govern your use of the Third Party Software. 
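Note on the benchmark changes above: check_results.py, csv_to_html.py and the new merge_csv_batch.py all key results by model, prompt length, and (newly) batch size. The snippet below is a minimal illustration of that key scheme, not the repository's actual scripts; the model name, config values, and DataFrame contents are placeholders, and it only assumes pandas plus OmegaConf as already used above.

# Illustrative sketch: expand a benchmark YAML config and a result CSV into the
# same "model:input_len:batch_size" keys so configured vs. executed test cases
# can be compared directly (mirrors the batch_size handling added above).
import pandas as pd
from omegaconf import OmegaConf

conf = OmegaConf.create({
    "repo_id": ["meta-llama/Llama-2-7b-chat-hf"],   # placeholder model
    "in_out_pairs": ["32-32", "1024-128"],
    "batch_size": [1, 2],                           # may also be a single int
})

# Expected test cases from the config.
batch_list = conf.batch_size if OmegaConf.is_list(conf.batch_size) else [conf.batch_size]
expected = {
    f"{model}:{in_out.split('-')[0]}:{bs}"
    for model in conf.repo_id
    for in_out in conf.in_out_pairs
    for bs in batch_list
}

# Actual test cases recorded in the benchmark CSV.
results = pd.DataFrame({
    "model": ["meta-llama/Llama-2-7b-chat-hf"],
    "input/output tokens": ["32-32"],
    "batch_size": [2],
})
actual = {
    f"{row['model']}:{row['input/output tokens'].split('-')[0]}:{row['batch_size']}"
    for _, row in results.iterrows()
}

print(sorted(expected - actual))   # configured cases that did not produce results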
\ No newline at end of file diff --git a/python/llm/tpp/licenses/LICENSE-gperftools.txt b/python/llm/tpp/licenses/LICENSE-gperftools.txt new file mode 100644 index 00000000000..dc761cc96f6 --- /dev/null +++ b/python/llm/tpp/licenses/LICENSE-gperftools.txt @@ -0,0 +1,28 @@ +Copyright (c) 2005, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/python/llm/tpp/licenses/LICENSE-jemalloc.txt b/python/llm/tpp/licenses/LICENSE-jemalloc.txt new file mode 100644 index 00000000000..05ed3514dc0 --- /dev/null +++ b/python/llm/tpp/licenses/LICENSE-jemalloc.txt @@ -0,0 +1,26 @@ +Unless otherwise specified, files in the jemalloc source distribution are +subject to the following license: +-------------------------------------------------------------------------------- +Copyright (C) 2002-present Jason Evans . +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. +Copyright (C) 2009-present Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file
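As an aside on the ipex_importer.py changes earlier in this patch: the duplicate-import guard works by replacing builtins.__import__ after intel_extension_for_pytorch has been auto-imported. The standalone sketch below shows that general pattern under simplified assumptions — it guards the stand-in module json rather than intel_extension_for_pytorch, and it reduces the calling-package check to a hypothetical prefix test — so it illustrates the mechanism, not the exact IPEX-LLM implementation.

# Minimal sketch of a duplicate-import guard via builtins.__import__.
import builtins
import inspect

original_import = builtins.__import__
GUARDED = {"json"}                 # stand-in for {"ipex", "intel_extension_for_pytorch"}
ALLOWED_PACKAGE_PREFIX = "mylib"   # hypothetical package whose internal imports are allowed


def _calling_package():
    # Frame layout: _calling_package -> guarded_import -> the module doing the import.
    frame = inspect.currentframe().f_back.f_back
    module = inspect.getmodule(frame)
    return module.__package__ if module else None


def guarded_import(name, globals=None, locals=None, fromlist=(), level=0):
    pkg = _calling_package()
    internal = pkg is not None and pkg.startswith(ALLOWED_PACKAGE_PREFIX)
    if name in GUARDED and not internal:
        raise ImportError(f"{name} has already been imported automatically; "
                          "please avoid importing it again")
    return original_import(name, globals, locals, fromlist, level)


import json                        # first (automatic) import succeeds
builtins.__import__ = guarded_import
try:
    import json                    # a second explicit import is rejected by the hook
except ImportError as exc:
    print("blocked:", exc)
finally:
    builtins.__import__ = original_import   # restore the default importer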