Add HuggingFace Llama3.2 1B to benchmark (pytorch#5368)
* Add compatible HuggingFace models to benchmark workflow

* Replace ones with rand to work around the crash from the SDPA kernel

---------

Co-authored-by: Guang Yang <[email protected]>
Co-authored-by: Github Executorch <[email protected]>
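
A minimal sketch of the second change, assuming the export's example inputs were constant tensors (shapes and names illustrative, not taken from the diff): all-ones inputs can drive the SDPA kernel into a degenerate case, so random values are used instead.

import torch
import torch.nn.functional as F

# Illustrative shapes only: (batch, num_heads, seq_len, head_dim).
# Previously something like q = torch.ones(1, 8, 16, 64) was used, which
# could crash the SDPA kernel; torch.rand sidesteps the degenerate case.
q = torch.rand(1, 8, 16, 64)
k = torch.rand(1, 8, 16, 64)
v = torch.rand(1, 8, 16, 64)
out = F.scaled_dot_product_attention(q, k, v)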
3 people authored Dec 18, 2024
1 parent dabb14e commit 72bb7b7
Showing 7 changed files with 308 additions and 23 deletions.
95 changes: 95 additions & 0 deletions .ci/scripts/download_hf_hub.sh
@@ -0,0 +1,95 @@
#!/bin/bash

# Function to download files from the Hugging Face Hub
# Arguments:
# 1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
# 2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
# 3. file_names: A space-separated list of filenames to be downloaded
# Returns:
# The directory containing the downloaded files
function download_hf_files() {
  local model_id="$1"
  local subdir="$2"
  shift 2
  local file_names=("$@") # Capture all remaining arguments as an array

  local download_dir

  # Use the first file to determine the download directory
  download_dir=$(python3 -c "
import os
from huggingface_hub import hf_hub_download
# Download the first file and get its directory
path = hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_names[0]}'
)
print(os.path.dirname(path))")

  if [ $? -ne 0 ]; then
    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
    return 1
  fi

  # Download remaining files into the same directory
  for file_name in "${file_names[@]:1}"; do
    python3 -c "
from huggingface_hub import hf_hub_download
# Download the file
hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_name}'
)"

    if [ $? -ne 0 ]; then
      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
      return 1
    fi
  done

  # Return the directory containing the downloaded files
  echo "$download_dir"
}

# Check if script is called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
# Parse arguments from CLI
while [[ $# -gt 0 ]]; do
case $1 in
--model_id)
MODEL_ID="$2"
shift 2
;;
--subdir)
SUBDIR="$2"
shift 2
;;
--files)
shift
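        # Collect every following argument up to the next --option as a file name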
        FILES_TO_DOWNLOAD=()
        while [[ $# -gt 0 && $1 != --* ]]; do
          FILES_TO_DOWNLOAD+=("$1")
          shift
        done
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Validate required arguments
  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
    echo "Usage: $0 --model_id <model_id> [--subdir <subdir>] --files <file1> [<file2> ...]" >&2
    exit 1
  fi

  # Call the function
  DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
  if [ $? -eq 0 ]; then
    echo "$DOWNLOAD_DIR"
  else
    exit 1
  fi
fi
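
For reference, the script is a thin wrapper around huggingface_hub's hf_hub_download; a minimal sketch of the equivalent direct call (the repo ID is illustrative, and the filename mirrors what the workflows below request):

from huggingface_hub import hf_hub_download
import os

# Download a single file from a repo (optionally under a subdirectory) and
# resolve the local cache directory, as download_hf_files does above.
path = hf_hub_download(
    repo_id="meta-llama/Llama-3.2-1B",    # illustrative repo ID
    filename="original/tokenizer.model",  # subdir "original" + file name
)
print(os.path.dirname(path))  # the directory the shell script echoes back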
105 changes: 102 additions & 3 deletions .github/workflows/android-perf.yml
@@ -108,6 +108,7 @@ jobs:
declare -A DEVICE_POOL_ARNS
DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
# Resolve device names with their corresponding ARNs
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -168,18 +169,20 @@ jobs:
name: export-models
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix:
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
fail-fast: false
with:
runner: linux.4xlarge
runner: linux.2xlarge.memory
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
timeout: 60
upload-artifact: android-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
echo "::group::Setting up dev environment"
@@ -190,14 +193,109 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
fi
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
echo "::endgroup::"
echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
DTYPE="fp32"
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
pip install -U "huggingface_hub[cli]"
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
pip install accelerate sentencepiece
# Hugging Face model; the repo ID is assumed to follow the "<org>/<repo>" pattern
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
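# e.g., "meta-llama/Llama-3.2-1B" with delegate "xnnpack" -> "llama-3.2-1b_xnnpack" (example values for illustration)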
if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
# SpinQuant
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
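# Note: the --preq_* flags describe the prequantized checkpoint; "8da4w_output_8da8w"
# reads as 8-bit dynamic activations with 4-bit weights, output layer at 8-bit weights
# (interpretation inferred from the flag name).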
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
# QAT + LoRA
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi
zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
@@ -209,6 +307,7 @@ jobs:
echo "Unsupported delegate ${{ matrix.delegate }}"
exit 1
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \
-build_tool "${BUILD_MODE}" \
103 changes: 101 additions & 2 deletions .github/workflows/apple-perf.yml
@@ -155,6 +155,7 @@ jobs:
name: export-models
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix:
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
@@ -168,6 +169,7 @@ jobs:
timeout: 60
upload-artifact: ios-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
script: |
set -eux
@@ -189,14 +191,110 @@ jobs:
backends/apple/mps/install_requirements.sh
fi
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
echo "::endgroup::"
echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
DTYPE="fp32"
if [[ ${{ matrix.model }} =~ ^stories* ]]; then
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
${CONDA_RUN} pip install -U "huggingface_hub[cli]"
${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
${CONDA_RUN} pip install accelerate sentencepiece
# Hugging Face model; the repo ID is assumed to follow the "<org>/<repo>" pattern
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
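# e.g., "meta-llama/Llama-3.2-1B" with delegate "xnnpack" -> "llama-3.2-1b_xnnpack" (example values for illustration)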
if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
# SpinQuant
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
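# Note: the --preq_* flags describe the prequantized checkpoint; "8da4w_output_8da8w"
# reads as 8-bit dynamic activations with 4-bit weights, output layer at 8-bit weights
# (interpretation inferred from the flag name).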
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
# QAT + LoRA
# Download the prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi
zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash examples/models/llama/install_requirements.sh
@@ -209,6 +307,7 @@ jobs:
elif [[ ${{ matrix.delegate }} == "mps" ]]; then
DELEGATE_CONFIG="mps"
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \
(Diffs for the remaining 4 changed files are not shown here.)
