
[Finetune] Add finetune Gaudi workflow #240

Open · wants to merge 11 commits into main
102 changes: 102 additions & 0 deletions .github/workflows/workflow_finetune_gaudi2.yml
@@ -0,0 +1,102 @@
name: Finetune

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'
      runner_container_image:
        type: string
        default: '127.0.0.1:5000/llmray-build'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
      code_checkout_path:
        type: string
        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
      model_cache_path:
        type: string
        default: '/scratch-2/huggingface/cache'

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-gaudi2
  cancel-in-progress: true

jobs:
  finetune:
    name: finetune
    strategy:
      matrix:
        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
        isPR:
          - ${{inputs.ci_type == 'pr'}}

        exclude:
          - { isPR: true }
        include:
          - { model: "EleutherAI/gpt-j-6b"}
          - { model: "meta-llama/Llama-2-7b-chat-hf"}
          - { model: "mistralai/Mistral-7B-v0.1"}
          - { model: "google/gemma-2b"}

    runs-on: gaudi2

    defaults:
      run:
        shell: bash
    container:
      image: ${{ inputs.runner_container_image }}
      env:
        http_proxy:
        https_proxy:
        SHELL: bash -eo pipefail
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
        - ${{ inputs.runner_config_path }}:/root/actions-runner-config

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Load environment variables
        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV

      - name: Build Docker Image
        run: |
          DF_SUFFIX=".habana"
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          build_and_prune_gaudi ${TARGET} ${DF_SUFFIX}

      - name: Start Docker Container
        run: |
          TARGET="finetune"
          code_checkout_path=${{ inputs.code_checkout_path }}
          model_cache_path=${{ inputs.model_cache_path }}
          source dev/scripts/ci-functions.sh
          start_docker_gaudi ${TARGET} ${code_checkout_path} ${model_cache_path} ${{env.HF_ACCESS_TOKEN}}

      - name: Run Finetune Test
        run: |
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          finetune_test_gaudi ${{ matrix.model }}

      - name: Run PEFT-LoRA Test
        run: |
          source dev/scripts/ci-functions.sh
          peft_lora_test_gaudi ${{ matrix.model }}

      - name: Stop Ray
        run: |
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          stop_ray ${TARGET}

      - name: Stop Container
        if: success() || failure()
        run: |
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          stop_container ${TARGET}
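
(For reference: the two caller workflows below use the defaults, but any of the workflow_call inputs declared above can be overridden by a caller via with:. A hypothetical example, not part of this diff — the 'nightly' value is an assumption:

  Finetune_Gaudi:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune_gaudi2.yml
    with:
      ci_type: 'nightly'   # hypothetical value overriding the 'pr' default
)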
4 changes: 4 additions & 0 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -27,6 +27,10 @@ jobs:
  Finetune:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune.yml

  Finetune_Gaudi:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune_gaudi2.yml

  Benchmark:
    needs: Lint
8 changes: 8 additions & 0 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -27,3 +27,11 @@ jobs:
  Finetune:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune.yml

  Finetune_Gaudi:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune_gaudi2.yml

  Benchmark:
    needs: Lint
    uses: ./.github/workflows/workflow_test_benchmark.yml
74 changes: 74 additions & 0 deletions dev/scripts/ci-functions.sh
@@ -31,7 +31,27 @@ build_and_prune() {
    # Build Docker image and perform cleaning operation
    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes \
        | docker image prune -f
}

build_and_prune_gaudi() {
Contributor comment: there is no need to duplicate code, could you merge this with build_and_prune

    # Set TARGET, DF_SUFFIX, and optional PYTHON_V from the passed-in parameters
    local TARGET=$1
    local DF_SUFFIX=$2
    local PYTHON_V=$3

    docker_args=()
    docker_args+=("--build-arg=CACHEBUST=1")

    if [ -n "$PYTHON_V" ]; then
        docker_args+=("--build-arg=python_v=${PYTHON_V}")
    fi

    echo "Build Docker image and perform cleaning operation"
    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f"

    # Build Docker image and perform cleaning operation
    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes \
        | docker image prune -f
}
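
(A possible shape for the merge suggested in the review comment above — a minimal, untested sketch that folds the image tag into an optional parameter so one build_and_prune covers both the :latest and :habana builds. The TAG parameter and its default are assumptions, not code from this PR.)

build_and_prune() {
    # Build ${TARGET}:${TAG} from Dockerfile${DF_SUFFIX}, then prune stopped containers and dangling images
    local TARGET=$1
    local DF_SUFFIX=$2
    local PYTHON_V=$3
    local TAG=${4:-latest}  # hypothetical 4th parameter; Gaudi callers would pass "habana"

    docker_args=()
    docker_args+=("--build-arg=CACHEBUST=1")

    if [ -n "$PYTHON_V" ]; then
        docker_args+=("--build-arg=python_v=${PYTHON_V}")
    fi

    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:${TAG} && yes | docker container prune && yes \
        | docker image prune -f
}

The Gaudi workflow step would then call, e.g., build_and_prune ${TARGET} ${DF_SUFFIX} "" habana.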

start_docker() {
@@ -68,6 +88,41 @@ start_docker() {
    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
}

start_docker_gaudi() {
    local TARGET=$1
    local code_checkout_path=$2
    local model_cache_path=$3
    local HF_TOKEN=$4

    cid=$(docker ps -q --filter "name=${TARGET}")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
    # check and remove exited container
    cid=$(docker ps -a -q --filter "name=${TARGET}")
    if [[ ! -z "$cid" ]]; then docker rm $cid; fi
    docker ps -a

    docker_args=()
    docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")

    if [ -z "$model_cache_path" ]; then
        echo "no cache path"
    else
        docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")
    fi

    docker_args+=("--runtime=habana")
    docker_args+=("--name=${TARGET}")
    docker_args+=("--hostname=${TARGET}-container")

    echo "docker run -tid "${docker_args[@]}" "${TARGET}:habana""
    docker run -tid "${docker_args[@]}" "${TARGET}:habana"
    if [ -z "$HF_TOKEN" ]; then
        echo "no hf token"
    else
        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
    fi
}

install_dependencies(){
    local TARGET=$1

@@ -225,3 +280,22 @@ peft_lora_test(){
    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"
}

finetune_test_gaudi(){
    local model=$1
    echo "Set finetune source config:"
    docker exec "finetune" bash -c "pip install --upgrade-strategy eager optimum[habana]"
    docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
    echo "Set ${model} patch_yaml_config:"
    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path llm_on_ray/finetune/finetune_gaudi.yaml --models ${model}"
    echo "Start ${model} finetune:"
    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
}

peft_lora_test_gaudi(){
    local model=$1
    docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
    echo "Set ${model} patch_yaml_config:"
    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path llm_on_ray/finetune/finetune_gaudi.yaml --models ${model} --peft_lora"
    echo "Start ${model} peft lora finetune:"
    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
}
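
(For reference, a hypothetical local run chaining the helpers above the same way the workflow does — this assumes a Gaudi host with the habana Docker runtime, the repo root as the working directory, and HF_ACCESS_TOKEN exported; stop_ray and stop_container are the existing helpers this file already defines:

source dev/scripts/ci-functions.sh
build_and_prune_gaudi finetune .habana             # builds the finetune:habana image
start_docker_gaudi finetune "$PWD" /scratch-2/huggingface/cache "$HF_ACCESS_TOKEN"
finetune_test_gaudi gpt2                           # any model from the CI matrix
peft_lora_test_gaudi gpt2
stop_ray finetune
stop_container finetune
)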
37 changes: 37 additions & 0 deletions llm_on_ray/finetune/finetune_gaudi.yaml
@@ -0,0 +1,37 @@
General:
  base_model: EleutherAI/gpt-j-6b
  gpt_base_model: true
  output_dir: /tmp/llm-ray/output
  save_strategy: no
  config:
    trust_remote_code: false
    use_auth_token: null
  lora_config:
    task_type: CAUSAL_LM
    r: 8
    lora_alpha: 32
    lora_dropout: 0.1
  enable_gradient_checkpointing: false
Dataset:
  train_file: examples/data/sample_finetune_data_small.jsonl
  group: true
  max_length: 512
  block_size: 512
  shuffle: false
  validation_file: null
  validation_split_percentage: 5
Training:
  optimizer: adamw_torch
  batch_size: 2
  epochs: 3
  learning_rate: 1.0e-05
  lr_scheduler: linear
  weight_decay: 0.0
  mixed_precision: bf16
  device: hpu
  num_training_workers: 2
  resources_per_worker:
    CPU: 32
  accelerate_mode: DDP
  gradient_accumulation_steps: 1
  logging_steps: 10
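
(The CI never edits this file directly: patch_yaml_config.py rewrites it per matrix model before each run. As a rough illustration only — the exact fields the script touches are an assumption — a patched run for Mistral would amount to:

General:
  base_model: mistralai/Mistral-7B-v0.1
  gpt_base_model: false   # presumably true only for GPT-style models such as the gpt-j default
)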