Add HuggingFace Llama3.2 1B to benchmark (pytorch#5368)
* Add compatible HuggingFace models to benchmark workflow

* Replace ones with rand to work around the crash from the SDPA kernel

---------

Co-authored-by: Guang Yang <[email protected]>
Co-authored-by: Github Executorch <[email protected]>
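
A minimal sketch of the second change, assuming the export's example inputs were constant tensors (shapes and names illustrative, not taken from the diff): all-ones inputs can drive the SDPA kernel into a degenerate case, so random values are used instead.

import torch
import torch.nn.functional as F

# Illustrative shapes only: (batch, num_heads, seq_len, head_dim).
# Previously something like q = torch.ones(1, 8, 16, 64) was used, which
# could crash the SDPA kernel; torch.rand sidesteps the degenerate case.
q = torch.rand(1, 8, 16, 64)
k = torch.rand(1, 8, 16, 64)
v = torch.rand(1, 8, 16, 64)
out = F.scaled_dot_product_attention(q, k, v)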
3 people authored Dec 18, 2024
1 parent dabb14e commit 72bb7b7
Showing 7 changed files with 308 additions and 23 deletions.
95 changes: 95 additions & 0 deletions .ci/scripts/download_hf_hub.sh
@@ -0,0 +1,95 @@
#!/bin/bash

# Function to download files from the Hugging Face Hub
# Arguments:
# 1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
# 2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
# 3. file_names: A space-separated list of filenames to be downloaded
# Returns:
# The directory containing the downloaded files
function download_hf_files() {
  local model_id="$1"
  local subdir="$2"
  shift 2
  local file_names=("$@") # Capture all remaining arguments as an array

  local download_dir

  # Use the first file to determine the download directory
  download_dir=$(python3 -c "
import os
from huggingface_hub import hf_hub_download
# Download the first file and get its directory
path = hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_names[0]}'
)
print(os.path.dirname(path))")

  if [ $? -ne 0 ]; then
    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
    return 1
  fi

  # Download remaining files into the same directory
  for file_name in "${file_names[@]:1}"; do
    python3 -c "
from huggingface_hub import hf_hub_download
# Download the file
hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_name}'
)"

    if [ $? -ne 0 ]; then
      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
      return 1
    fi
  done

  # Return the directory containing the downloaded files
  echo "$download_dir"
}

# Check if script is called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
# Parse arguments from CLI
while [[ $# -gt 0 ]]; do
case $1 in
--model_id)
MODEL_ID="$2"
shift 2
;;
--subdir)
SUBDIR="$2"
shift 2
;;
--files)
shift
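        # Collect every following argument up to the next --option as a file name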
        FILES_TO_DOWNLOAD=()
        while [[ $# -gt 0 && $1 != --* ]]; do
          FILES_TO_DOWNLOAD+=("$1")
          shift
        done
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Validate required arguments
  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
    echo "Usage: $0 --model_id <model_id> [--subdir <subdir>] --files <file1> [<file2> ...]" >&2
    exit 1
  fi

  # Call the function
  DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
  if [ $? -eq 0 ]; then
    echo "$DOWNLOAD_DIR"
  else
    exit 1
  fi
fi
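
For reference, the script is a thin wrapper around huggingface_hub's hf_hub_download; a minimal sketch of the equivalent direct call (the repo ID is illustrative, and the filename mirrors what the workflows below request):

from huggingface_hub import hf_hub_download
import os

# Download a single file from a repo (optionally under a subdirectory) and
# resolve the local cache directory, as download_hf_files does above.
path = hf_hub_download(
    repo_id="meta-llama/Llama-3.2-1B",    # illustrative repo ID
    filename="original/tokenizer.model",  # subdir "original" + file name
)
print(os.path.dirname(path))  # the directory the shell script echoes back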
105 changes: 102 additions & 3 deletions .github/workflows/android-perf.yml
@@ -108,6 +108,7 @@ jobs:
declare -A DEVICE_POOL_ARNS
DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
# Resolve device names with their corresponding ARNs
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -168,18 +169,20 @@ jobs:
name: export-models
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix:
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
fail-fast: false
with:
runner: linux.4xlarge
runner: linux.2xlarge.memory
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
timeout: 60
upload-artifact: android-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
echo "::group::Setting up dev environment"
@@ -190,14 +193,109 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
fi
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
echo "::endgroup::"
echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
DTYPE="fp32"
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
pip install -U "huggingface_hub[cli]"
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
pip install accelerate sentencepiece
# Hugging Face model; the repo ID is assumed to follow the "<org>/<repo>" pattern
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
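# e.g., "meta-llama/Llama-3.2-1B" with delegate "xnnpack" -> "llama-3.2-1b_xnnpack" (example values for illustration)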
if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
# SpinQuant
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
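# Note: the --preq_* flags describe the prequantized checkpoint; "8da4w_output_8da8w"
# reads as 8-bit dynamic activations with 4-bit weights, output layer at 8-bit weights
# (interpretation inferred from the flag name).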
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
# QAT + LoRA
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi
zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
@@ -209,6 +307,7 @@ jobs:
echo "Unsupported delegate ${{ matrix.delegate }}"
exit 1
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \
-build_tool "${BUILD_MODE}" \
103 changes: 101 additions & 2 deletions .github/workflows/apple-perf.yml
@@ -155,6 +155,7 @@ jobs:
name: export-models
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix:
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
@@ -168,6 +169,7 @@ jobs:
timeout: 60
upload-artifact: ios-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
script: |
set -eux
@@ -189,14 +191,110 @@ jobs:
backends/apple/mps/install_requirements.sh
fi
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
echo "::endgroup::"
echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
DTYPE="fp32"
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
${CONDA_RUN} pip install -U "huggingface_hub[cli]"
${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
${CONDA_RUN} pip install accelerate sentencepiece
# Hugging Face model; the repo ID is assumed to follow the "<org>/<repo>" pattern
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
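# e.g., "meta-llama/Llama-3.2-1B" with delegate "xnnpack" -> "llama-3.2-1b_xnnpack" (example values for illustration)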
if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
# SpinQuant
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
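# Note: the --preq_* flags describe the prequantized checkpoint; "8da4w_output_8da8w"
# reads as 8-bit dynamic activations with 4-bit weights, output layer at 8-bit weights
# (interpretation inferred from the flag name).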
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
# QAT + LoRA
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi
zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash examples/models/llama/install_requirements.sh
@@ -209,6 +307,7 @@ jobs:
elif [[ ${{ matrix.delegate }} == "mps" ]]; then
DELEGATE_CONFIG="mps"
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \
(Diffs for the remaining 4 changed files are not shown here.)
