From aaa8a50c1f16acbf5d754c7030db677f5f31f1ef Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:26:28 +0000
Subject: [PATCH 1/6] add finetune gaudi2 workflow

---
 .../workflows/workflow_finetune_gaudi2.yml    | 108 ++++++++++++++++++
 dev/scripts/ci-functions.sh                   |  87 ++++++++++++++
 llm_on_ray/finetune/finetune_gaudi.yaml       |  38 ++++++
 3 files changed, 233 insertions(+)
 create mode 100644 .github/workflows/workflow_finetune_gaudi2.yml
 create mode 100644 llm_on_ray/finetune/finetune_gaudi.yaml

diff --git a/.github/workflows/workflow_finetune_gaudi2.yml b/.github/workflows/workflow_finetune_gaudi2.yml
new file mode 100644
index 000000000..c91a7204d
--- /dev/null
+++ b/.github/workflows/workflow_finetune_gaudi2.yml
@@ -0,0 +1,108 @@
+name: Finetune
+
+on:
+  workflow_call:
+    inputs:
+      ci_type:
+        type: string
+        default: 'pr'
+      runner_container_image:
+        type: string
+        default: '127.0.0.1:5000/llmray-build'
+      runner_config_path:
+        type: string
+        default: '/home/ci/llm-ray-actions-runner'
+      code_checkout_path:
+        type: string
+        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
+      model_cache_path:
+        type: string
+        default: '/scratch-2/huggingface/cache'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-gaudi2
+  cancel-in-progress: true
+
+jobs:
+  finetune:
+    name: finetune
+    strategy:
+      matrix:
+        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        isPR:
+          - ${{inputs.ci_type == 'pr'}}
+
+        exclude:
+          - { isPR: true }
+        include:
+          - { model: "EleutherAI/gpt-j-6b"}
+          - { model: "meta-llama/Llama-2-7b-chat-hf"}
+          - { model: "mistralai/Mistral-7B-v0.1"}
+          - { model: "google/gemma-2b"}
+
+    runs-on: gaudi2
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ inputs.runner_container_image }}
+      env:
+        http_proxy:
+        https_proxy:
+        SHELL: bash -eo pipefail
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ inputs.runner_config_path }}:/root/actions-runner-config
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Load environment variables
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
+
+      - name: Build Docker Image
+        run: |
+          DF_SUFFIX=".habana"
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          build_and_prune_gaudi ${TARGET} ${DF_SUFFIX}
+
+      - name: Start Docker Container
+        run: |
+          TARGET="finetune"
+          code_checkout_path=${{ inputs.code_checkout_path }}
+          model_cache_path=${{ inputs.model_cache_path }}
+          source dev/scripts/ci-functions.sh
+          start_docker_gaudi ${TARGET} ${code_checkout_path} ${model_cache_path} ${{env.HF_ACCESS_TOKEN}}
+
+      - name: Run Finetune Test
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          finetune_test_gaudi ${{ matrix.model }}
+
+      - name: Run PEFT-LoRA Test
+        run: |
+          source dev/scripts/ci-functions.sh
+          peft_lora_test_gaudi ${{ matrix.model }}
+
+      - name: Run Deltatuner Test on DENAS-LoRA Model
+        run: |
+          source dev/scripts/ci-functions.sh
+          denas_lora_test_gaudi ${{ matrix.model }}
+
+      - name: Stop Ray
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          stop_ray ${TARGET}
+
+      - name: Stop Container
+        if: success() || failure()
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          stop_container ${TARGET}
+
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index af345c47e..213144be2 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -31,7 +31,27 @@ build_and_prune() {
 
     # Build Docker image and perform cleaning operation
     docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f
+}
+
+build_and_prune_gaudi() {
+    # Set TARGET and DF_SUFFIX using the passed-in parameters
+    local TARGET=$1
+    local DF_SUFFIX=$2
+    local PYTHON_V=$3
+
+    docker_args=()
+    docker_args+=("--build-arg=CACHEBUST=1")
+
+    if [ -n "$PYTHON_V" ]; then
+        docker_args+=("--build-arg=python_v=${PYTHON_V}")
+    fi
+
+    echo "Build Docker image and perform cleaning operation"
+    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f"
+
+    # Build Docker image and perform cleaning operation
+    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f
 }
 
 start_docker() {
@@ -74,6 +94,41 @@ start_docker() {
     fi
 }
 
+start_docker_gaudi() {
+    local TARGET=$1
+    local code_checkout_path=$2
+    local model_cache_path=$3
+    local HF_TOKEN=$4
+
+    cid=$(docker ps -q --filter "name=${TARGET}")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+    # check and remove exited container
+    cid=$(docker ps -a -q --filter "name=${TARGET}")
+    if [[ ! -z "$cid" ]]; then docker rm $cid; fi
+    docker ps -a
+
+    docker_args=()
+    docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")
+
+    if [ -z "$model_cache_path" ]; then
+        echo "no cache path"
+    else
+        docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")
+    fi
+
+    docker_args+=("--runtime=habana" )
+    docker_args+=("--name=${TARGET}" )
+    docker_args+=("--hostname=${TARGET}-container")
+
+    echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
+    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
+    if [ -z "$HF_TOKEN" ]; then
+        echo "no hf token"
+    else
+        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
+    fi
+}
+
 install_dependencies(){
     local TARGET=$1
 
@@ -275,4 +330,36 @@ denas_lora_test(){
         echo Stert "${model}" peft lora finetune :
         docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"
     fi
+}
+
+finetune_test_gaudi(){
+    local model=$1
+    echo Set finetune source config :
+    docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
+    echo Set "${model}" patch_yaml_config :
+    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} "
+    echo Start "${model}" finetune :
+    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+}
+
+peft_lora_test_gaudi(){
+    local model=$1
+    docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+    echo Set "${model}" patch_yaml_config :
+    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} --peft_lora"
+    echo Start "${model}" peft lora finetune :
+    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+}
+
+denas_lora_test_gaudi(){
+    local model=$1
+    if [[ ${model} =~ ^(mosaicml\/mpt-7b|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf|mistralai\/Mistral-7B-v0.1|google\/gemma-2b)$ ]]; then
+        echo ${model} is not supported!
+    else
+        docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+        echo Set "${model}" patch_yaml_config :
+        docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} --peft_lora --denas_lora"
+        echo Start "${model}" denas lora finetune :
+        docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+    fi
 }
\ No newline at end of file
diff --git a/llm_on_ray/finetune/finetune_gaudi.yaml b/llm_on_ray/finetune/finetune_gaudi.yaml
new file mode 100644
index 000000000..d44c8bce8
--- /dev/null
+++ b/llm_on_ray/finetune/finetune_gaudi.yaml
@@ -0,0 +1,38 @@
+General:
+  base_model: EleutherAI/gpt-j-6b
+  gpt_base_model: true
+  output_dir: /tmp/llm-ray/output
+  save_strategy: no
+  config:
+    trust_remote_code: false
+    use_auth_token: null
+  lora_config:
+    task_type: CAUSAL_LM
+    r: 8
+    lora_alpha: 32
+    lora_dropout: 0.1
+  enable_gradient_checkpointing: false
+Dataset:
+  train_file: examples/data/sample_finetune_data_small.jsonl
+  group: true
+  max_length: 512
+  block_size: 512
+  shuffle: false
+  validation_file: null
+  validation_split_percentage: 5
+Training:
+  optimizer: adamw_torch
+  batch_size: 2
+  epochs: 3
+  learning_rate: 1.0e-05
+  lr_scheduler: linear
+  weight_decay: 0.0
+  mixed_precision: bf16
+  device: hpu
+  num_training_workers: 2
+  resources_per_worker:
+    CPU: 32
+  accelerate_mode: DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
+  deepspeed_config_file: finetune/ds_config_zero2.json

From 50268df3c17742f39a1cbf5e2de07e493b492464 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:29:50 +0000
Subject: [PATCH 2/6] add to workflow

---
 .github/workflows/workflow_orders_on_merge.yml | 4 ++++
 .github/workflows/workflow_orders_on_pr.yml    | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml
index 69d72a7f8..af66377e8 100644
--- a/.github/workflows/workflow_orders_on_merge.yml
+++ b/.github/workflows/workflow_orders_on_merge.yml
@@ -27,3 +27,7 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
+
+  Finetune_Gaudi:
+    needs: Lint
+    uses: ./.github/workflows/workflow_finetune_gaudi2.yml
diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml
index acec72a6d..2ea9e76ea 100644
--- a/.github/workflows/workflow_orders_on_pr.yml
+++ b/.github/workflows/workflow_orders_on_pr.yml
@@ -27,3 +27,7 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
+
+  Finetune_Gaudi:
+    needs: Lint
+    uses: ./.github/workflows/workflow_finetune_gaudi2.yml

From df1cbc727f27987a1b5fe5dbca4d9da3d68071b9 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:40:43 +0000
Subject: [PATCH 3/6] nit

---
 dev/scripts/ci-functions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index 213144be2..e7849b596 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -120,8 +120,8 @@ start_docker_gaudi() {
     docker_args+=("--name=${TARGET}" )
     docker_args+=("--hostname=${TARGET}-container")
 
-    echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
-    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
+    echo "docker run -tid "${docker_args[@]}" "${TARGET}:habana""
+    docker run -tid "${docker_args[@]}" "${TARGET}:habana"
     if [ -z "$HF_TOKEN" ]; then
         echo "no hf token"
     else

From f12fe5f53a7136d6b0a145eb65ea9b0e3bbe0dc0 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 09:09:42 +0000
Subject: [PATCH 4/6] remove deltatuner

---
 .github/workflows/workflow_finetune_gaudi2.yml | 6 ------
 dev/scripts/ci-functions.sh                    | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/.github/workflows/workflow_finetune_gaudi2.yml b/.github/workflows/workflow_finetune_gaudi2.yml
index c91a7204d..a79db5b9d 100644
--- a/.github/workflows/workflow_finetune_gaudi2.yml
+++ b/.github/workflows/workflow_finetune_gaudi2.yml
@@ -88,11 +88,6 @@ jobs:
           source dev/scripts/ci-functions.sh
           peft_lora_test_gaudi ${{ matrix.model }}
 
-      - name: Run Deltatuner Test on DENAS-LoRA Model
-        run: |
-          source dev/scripts/ci-functions.sh
-          denas_lora_test_gaudi ${{ matrix.model }}
-
       - name: Stop Ray
         run: |
           TARGET="finetune"
@@ -105,4 +100,3 @@ jobs:
           TARGET="finetune"
           source dev/scripts/ci-functions.sh
           stop_container ${TARGET}
-
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index e7849b596..96ceb6fb2 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -362,4 +362,4 @@ denas_lora_test_gaudi(){
         echo Start "${model}" denas lora finetune :
         docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
     fi
-}
\ No newline at end of file
+}

From 3ec0a3fe31b37f7297ec26acfa72fd496a4d8fbe Mon Sep 17 00:00:00 2001
From: Deegue
Date: Fri, 31 May 2024 03:51:55 +0000
Subject: [PATCH 5/6] fix transformers

---
 dev/scripts/ci-functions.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index 5e2c3345e..993e71dde 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -289,6 +289,7 @@ peft_lora_test(){
 finetune_test_gaudi(){
     local model=$1
     echo Set finetune source config :
+    docker exec "finetune" bash -c "pip install --upgrade-strategy eager optimum[habana]"
     docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
     echo Set "${model}" patch_yaml_config :
     docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} "

From c63697a10d86d833e05b6c7cdbf53b9bf3cfebf3 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Fri, 31 May 2024 03:52:14 +0000
Subject: [PATCH 6/6] remove deepspeed config file

---
 llm_on_ray/finetune/finetune_gaudi.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llm_on_ray/finetune/finetune_gaudi.yaml b/llm_on_ray/finetune/finetune_gaudi.yaml
index d44c8bce8..a972fe918 100644
--- a/llm_on_ray/finetune/finetune_gaudi.yaml
+++ b/llm_on_ray/finetune/finetune_gaudi.yaml
@@ -35,4 +35,3 @@ Training:
   accelerate_mode: DDP
   gradient_accumulation_steps: 1
   logging_steps: 10
-  deepspeed_config_file: finetune/ds_config_zero2.json