From 72bb7b7835d9fddf572f462929e7004b740c1b63 Mon Sep 17 00:00:00 2001
From: Guang Yang <42389959+guangy10@users.noreply.github.com>
Date: Tue, 17 Dec 2024 16:32:07 -0800
Subject: [PATCH] Add HuggingFace Llama3.2 1B to benchmark (#5368)

* Add compatible HuggingFace models to benchmark workflow

* Replace ones with rand to work around the crash from the sdpa kernel

---------

Co-authored-by: Guang Yang
Co-authored-by: Github Executorch
---
 .ci/scripts/download_hf_hub.sh                |  95 ++++++++++++++++
 .github/workflows/android-perf.yml            | 105 +++++++++++++++++-
 .github/workflows/apple-perf.yml              | 103 ++++++++++++++++-
 .github/workflows/trunk.yml                   |  19 +---
 .../apple/Benchmark/Tests/GenericTests.mm     |   2 +-
 .../apple/Benchmark/Tests/LLaMA/LLaMATests.mm |   4 +-
 ...lt-ios-device-farm-appium-test-spec.yml.j2 |   3 +-
 7 files changed, 308 insertions(+), 23 deletions(-)
 create mode 100644 .ci/scripts/download_hf_hub.sh

diff --git a/.ci/scripts/download_hf_hub.sh b/.ci/scripts/download_hf_hub.sh
new file mode 100644
index 0000000000..b47fc5dd21
--- /dev/null
+++ b/.ci/scripts/download_hf_hub.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+# Function to download files from the Hugging Face Hub
+# Arguments:
+#   1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
+#   2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
+#   3. file_names: A space-separated list of filenames to be downloaded
+# Returns:
+#   The directory containing the downloaded files
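+#
+# Example (illustrative only; the repo id and file list below are hypothetical
+# and mirror how the benchmark workflows later in this patch invoke the script):
+#   bash .ci/scripts/download_hf_hub.sh --model_id "meta-llama/Llama-3.2-1B" \
+#     --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth"
+#   # Prints the local cache directory that now contains the requested files.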
+function download_hf_files() {
+  local model_id="$1"
+  local subdir="$2"
+  shift 2
+  local file_names=("$@") # Capture all remaining arguments as an array
+
+  local download_dir
+
+  # Use the first file to determine the download directory
+  download_dir=$(python3 -c "
+from huggingface_hub import hf_hub_download
+# Download the first file and get its directory
+path = hf_hub_download(
+    repo_id='${model_id}',
+    filename='${subdir:+${subdir}/}${file_names[0]}'
+)
+import os
+print(os.path.dirname(path))")
+
+  if [ $? -ne 0 ]; then
+    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
+    return 1
+  fi
+
+  # Download the remaining files into the same directory
+  for file_name in "${file_names[@]:1}"; do
+    python3 -c "
+from huggingface_hub import hf_hub_download
+# Download the file
+hf_hub_download(
+    repo_id='${model_id}',
+    filename='${subdir:+${subdir}/}${file_name}'
+)"
+
+    if [ $? -ne 0 ]; then
+      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
+      return 1
+    fi
+  done
+
+  # Return the directory containing the downloaded files
+  echo "$download_dir"
+}
+
+# Check if the script is called directly
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  # Parse arguments from the CLI
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      --model_id)
+        MODEL_ID="$2"
+        shift 2
+        ;;
+      --subdir)
+        SUBDIR="$2"
+        shift 2
+        ;;
+      --files)
+        shift
+        FILES_TO_DOWNLOAD=()
+        while [[ $# -gt 0 && $1 != --* ]]; do
+          FILES_TO_DOWNLOAD+=("$1")
+          shift
+        done
+        ;;
+      *)
+        echo "Unknown option: $1" >&2
+        exit 1
+        ;;
+    esac
+  done
+
+  # Validate required arguments
+  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
+    echo "Usage: $0 --model_id <model_id> --subdir <subdir> --files <file1> [<file2> ...]" >&2
+    exit 1
+  fi
+
+  # Call the function
+  DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
+  if [ $? -eq 0 ]; then
+    echo "$DOWNLOAD_DIR"
+  else
+    exit 1
+  fi
+fi
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index fce17e85a9..8dbdecbee7 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -108,6 +108,7 @@ jobs:
           declare -A DEVICE_POOL_ARNS
           DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
           DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
+          DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
 
           # Resolve device names with their corresponding ARNs
           if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -168,18 +169,20 @@ jobs:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
+    secrets: inherit
     strategy:
       matrix:
         model: ${{ fromJson(needs.set-parameters.outputs.models) }}
        delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
       fail-fast: false
     with:
-      runner: linux.4xlarge
+      runner: linux.2xlarge.memory
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
       timeout: 60
       upload-artifact: android-models
       upload-artifact-to-s3: true
+      secrets-env: EXECUTORCH_HF_TOKEN
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         echo "::group::Setting up dev environment"
@@ -190,14 +193,109 @@ jobs:
           PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         fi
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
 
         ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
         echo "::endgroup::"
         echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
         BUILD_MODE="cmake"
-        DTYPE="fp32"
 
-        if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+        if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+          pip install -U "huggingface_hub[cli]"
+          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          pip install accelerate sentencepiece
+          # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+          HF_MODEL_REPO=${{ matrix.model }}
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
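+          # Illustrative only: a hypothetical repo id "meta-llama/Llama-3.2-1B" with the
+          # "xnnpack" delegate would yield OUT_ET_MODEL_NAME="llama-3.2-1b_xnnpack".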
+
+          if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+            # Llama models on Hugging Face
+            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+              # SpinQuant
+              # Download prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --use_sdpa_with_kv_cache \
+                -X \
+                --xnnpack-extended-ops \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                -kv \
+                -d fp32 \
+                --preq_embedding_quantize 8,0 \
+                --use_spin_quant native \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+              # QAT + LoRA
+              # Download prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -qat \
+                -lora 16 \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --preq_embedding_quantize 8,0 \
+                --use_sdpa_with_kv_cache \
+                -kv \
+                -X \
+                --xnnpack-extended-ops \
+                -d fp32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            else
+              if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+                # Original BF16 version, without any quantization
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+                python -m examples.models.llama.export_llama \
+                  --model "llama3_2" \
+                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                  --params "${DOWNLOADED_PATH}/params.json" \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -X \
+                  -d bf16 \
+                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              else
+                # By default, test with the Hugging Face model and the xnnpack recipe
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+                python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              fi
+            fi
+          else
+            echo "Unsupported model ${{ matrix.model }}"
+            exit 1
+          fi
+
+          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          ls -lh model.zip
+          mkdir -p "${ARTIFACTS_DIR_NAME}"
+          mv model.zip "${ARTIFACTS_DIR_NAME}"
+        elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
           # Test llama2
@@ -209,6 +307,7 @@ jobs:
             echo "Unsupported delegate ${{ matrix.delegate }}"
"Unsupported delegate ${{ matrix.delegate }}" exit 1 fi + DTYPE="fp32" PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \ -model "${{ matrix.model }}" \ -build_tool "${BUILD_MODE}" \ diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 394c148cf1..f2a897f72f 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -155,6 +155,7 @@ jobs: name: export-models uses: pytorch/test-infra/.github/workflows/macos_job.yml@main needs: set-parameters + secrets: inherit strategy: matrix: model: ${{ fromJson(needs.set-parameters.outputs.models) }} @@ -168,6 +169,7 @@ jobs: timeout: 60 upload-artifact: ios-models upload-artifact-to-s3: true + secrets-env: EXECUTORCH_HF_TOKEN script: | set -eux @@ -189,14 +191,110 @@ jobs: backends/apple/mps/install_requirements.sh fi + # Install requirements for export_llama + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} echo "::endgroup::" echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" BUILD_MODE="cmake" - DTYPE="fp32" - if [[ ${{ matrix.model }} =~ ^stories* ]]; then + if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + ${CONDA_RUN} pip install accelerate sentencepiece + # HuggingFace model. Assume the pattern is always like "/" + HF_MODEL_REPO=${{ matrix.model }} + OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}" + + if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then + # Llama models on Hugging Face + if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then + # SpinQuant + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + --use_sdpa_with_kv_cache \ + -X \ + --xnnpack-extended-ops \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --max_seq_length 2048 \ + --output_name "${OUT_ET_MODEL_NAME}.pte" \ + -kv \ + -d fp32 \ + --preq_embedding_quantize 8,0 \ + --use_spin_quant native \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then + # QAT + LoRA + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -qat \ + -lora 16 \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --preq_embedding_quantize 8,0 \ + --use_sdpa_with_kv_cache \ + -kv \ + -X \ + --xnnpack-extended-ops \ + -d fp32 \ + --max_seq_length 2048 \ + --output_name "${OUT_ET_MODEL_NAME}.pte" \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ls -lh 
"${OUT_ET_MODEL_NAME}.pte" + else + if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then + # Original BF16 version, without any quantization + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -d bf16 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + else + # By default, test with the Hugging Face model and the xnnpack recipe + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") + ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + fi + else + echo "Unsupported model ${{ matrix.model }}" + exit 1 + fi + + zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model" + ls -lh model.zip + mkdir -p "${ARTIFACTS_DIR_NAME}" + mv model.zip "${ARTIFACTS_DIR_NAME}" + elif [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash examples/models/llama/install_requirements.sh @@ -209,6 +307,7 @@ jobs: elif [[ ${{ matrix.delegate }} == "mps" ]]; then DELEGATE_CONFIG="mps" fi + DTYPE="fp32" PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash .ci/scripts/test_llama.sh \ -model "${{ matrix.model }}" \ diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 365c7564fe..7972269e92 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -358,11 +358,11 @@ jobs: secrets: inherit strategy: matrix: - hf_model_repo: [google/gemma-2b] + hf_model_repo: [google/gemma-2-2b] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN - runner: linux.12xlarge + runner: linux.2xlarge.memory docker-image: executorch-ubuntu-22.04-clang12 submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -420,19 +420,10 @@ jobs: TOKENIZER_FILE=tokenizer.model TOKENIZER_BIN_FILE=tokenizer.bin ET_MODEL_NAME=et_model - # Fetch the file using a Python one-liner - DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " - from huggingface_hub import hf_hub_download - # Download the file from the Hugging Face Hub - downloaded_path = hf_hub_download( - repo_id='${{ matrix.hf_model_repo }}', - filename='${TOKENIZER_FILE}' - ) - print(downloaded_path) - ") - if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + DOWNLOADED_TOKENIZER_FILE_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${{ matrix.hf_model_repo }}" --files "${TOKENIZER_FILE}") + if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" ]; then echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" - python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + python -m extension.llm.tokenizer.tokenizer -t "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" -o ./${TOKENIZER_BIN_FILE} ls ./tokenizer.bin else echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." 
diff --git a/extension/benchmark/apple/Benchmark/Tests/GenericTests.mm b/extension/benchmark/apple/Benchmark/Tests/GenericTests.mm
index f6c6927e78..9972091819 100644
--- a/extension/benchmark/apple/Benchmark/Tests/GenericTests.mm
+++ b/extension/benchmark/apple/Benchmark/Tests/GenericTests.mm
@@ -78,7 +78,7 @@ @implementation GenericTests
         const auto sizes = tensor_meta->sizes();
         tensors.emplace_back(
-            ones({sizes.begin(), sizes.end()}, tensor_meta->scalar_type()));
+            rand({sizes.begin(), sizes.end()}, tensor_meta->scalar_type()));
         XCTAssertEqual(module->set_input(tensors.back(), index), Error::Ok);
       } break;
       default:
diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
index c03ad14517..16c1c1c1d6 100644
--- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
+++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm
@@ -63,7 +63,7 @@ @implementation LLaMATests
       return [filename hasSuffix:@".pte"] && [filename containsString:@"llama"];
     },
     @"tokenizer" : ^BOOL(NSString *filename) {
-      return [filename isEqual:@"tokenizer.bin"];
+      return [filename isEqual:@"tokenizer.bin"] || [filename isEqual:@"tokenizer.model"];
     },
   };
 }
@@ -87,7 +87,7 @@ @implementation LLaMATests
   tokensPerSecondMetric.tokenCount = 0;
   const auto status = runner->generate(
       "Once upon a time",
-      128,
+      50,
       [=](const std::string &token) {
         tokensPerSecondMetric.tokenCount++;
       },
diff --git a/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 b/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
index dc610437fc..68f8399f16 100644
--- a/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
+++ b/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
@@ -19,8 +19,9 @@ phases:
       # Copy the model
       - mkdir -p /tmp/Payload/Benchmark.app/aatp/data
-      - cp *.bin /tmp/Payload/Benchmark.app/aatp/data
+      - cp tokenizer.* /tmp/Payload/Benchmark.app/aatp/data
       - cp *.pte /tmp/Payload/Benchmark.app/aatp/data
+      - ls -all /tmp/Payload/Benchmark.app/aatp/data
 
       - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos
       - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Release-iphoneos