From aaa8a50c1f16acbf5d754c7030db677f5f31f1ef Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:26:28 +0000
Subject: [PATCH 1/6] add finetune gaudi2 workflow

---
 .../workflows/workflow_finetune_gaudi2.yml    | 108 ++++++++++++++++++
 dev/scripts/ci-functions.sh                   |  87 ++++++++++++++
 llm_on_ray/finetune/finetune_gaudi.yaml       |  38 ++++++
 3 files changed, 233 insertions(+)
 create mode 100644 .github/workflows/workflow_finetune_gaudi2.yml
 create mode 100644 llm_on_ray/finetune/finetune_gaudi.yaml

diff --git a/.github/workflows/workflow_finetune_gaudi2.yml b/.github/workflows/workflow_finetune_gaudi2.yml
new file mode 100644
index 000000000..c91a7204d
--- /dev/null
+++ b/.github/workflows/workflow_finetune_gaudi2.yml
@@ -0,0 +1,108 @@
+name: Finetune
+
+on:
+  workflow_call:
+    inputs:
+      ci_type:
+        type: string
+        default: 'pr'
+      runner_container_image:
+        type: string
+        default: '127.0.0.1:5000/llmray-build'
+      runner_config_path:
+        type: string
+        default: '/home/ci/llm-ray-actions-runner'
+      code_checkout_path:
+        type: string
+        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
+      model_cache_path:
+        type: string
+        default: '/scratch-2/huggingface/cache'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-gaudi2
+  cancel-in-progress: true
+
+jobs:
+  finetune:
+    name: finetune
+    strategy:
+      matrix:
+        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        isPR:
+          - ${{inputs.ci_type == 'pr'}}
+
+        exclude:
+          - { isPR: true }
+        include:
+          - { model: "EleutherAI/gpt-j-6b"}
+          - { model: "meta-llama/Llama-2-7b-chat-hf"}
+          - { model: "mistralai/Mistral-7B-v0.1"}
+          - { model: "google/gemma-2b"}
+
+    runs-on: gaudi2
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ inputs.runner_container_image }}
+      env:
+        http_proxy:
+        https_proxy:
+        SHELL: bash -eo pipefail
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ inputs.runner_config_path }}:/root/actions-runner-config
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Load environment variables
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
+
+      - name: Build Docker Image
+        run: |
+          DF_SUFFIX=".habana"
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          build_and_prune_gaudi ${TARGET} ${DF_SUFFIX}
+
+      - name: Start Docker Container
+        run: |
+          TARGET="finetune"
+          code_checkout_path=${{ inputs.code_checkout_path }}
+          model_cache_path=${{ inputs.model_cache_path }}
+          source dev/scripts/ci-functions.sh
+          start_docker_gaudi ${TARGET} ${code_checkout_path} ${model_cache_path} ${{env.HF_ACCESS_TOKEN}}
+
+      - name: Run Finetune Test
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          finetune_test_gaudi ${{ matrix.model }}
+
+      - name: Run PEFT-LoRA Test
+        run: |
+          source dev/scripts/ci-functions.sh
+          peft_lora_test_gaudi ${{ matrix.model }}
+
+      - name: Run Deltatuner Test on DENAS-LoRA Model
+        run: |
+          source dev/scripts/ci-functions.sh
+          denas_lora_test_gaudi ${{ matrix.model }}
+
+      - name: Stop Ray
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          stop_ray ${TARGET}
+
+      - name: Stop Container
+        if: success() || failure()
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          stop_container ${TARGET}
+
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index af345c47e..213144be2 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -31,7 +31,27 @@ build_and_prune() {
 
     # Build Docker image and perform cleaning operation
     docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f
+}
+
+build_and_prune_gaudi() {
+    # Set TARGET and DF_SUFFIX using the passed-in parameters
+    local TARGET=$1
+    local DF_SUFFIX=$2
+    local PYTHON_V=$3
+
+    docker_args=()
+    docker_args+=("--build-arg=CACHEBUST=1")
+
+    if [ -n "$PYTHON_V" ]; then
+        docker_args+=("--build-arg=python_v=${PYTHON_V}")
+    fi
+
+    echo "Build Docker image and perform cleaning operation"
+    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f"
+
+    # Build Docker image and perform cleaning operation
+    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f
 }
 
 start_docker() {
@@ -74,6 +94,41 @@ start_docker() {
     fi
 }
 
+start_docker_gaudi() {
+    local TARGET=$1
+    local code_checkout_path=$2
+    local model_cache_path=$3
+    local HF_TOKEN=$4
+
+    cid=$(docker ps -q --filter "name=${TARGET}")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+    # check and remove exited container
+    cid=$(docker ps -a -q --filter "name=${TARGET}")
+    if [[ ! -z "$cid" ]]; then docker rm $cid; fi
+    docker ps -a
+
+    docker_args=()
+    docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")
+
+    if [ -z "$model_cache_path" ]; then
+        echo "no cache path"
+    else
+        docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")
+    fi
+
+    docker_args+=("--runtime=habana" )
+    docker_args+=("--name=${TARGET}" )
+    docker_args+=("--hostname=${TARGET}-container")
+
+    echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
+    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
+    if [ -z "$HF_TOKEN" ]; then
+        echo "no hf token"
+    else
+        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
+    fi
+}
+
 install_dependencies(){
     local TARGET=$1
 
@@ -275,4 +330,36 @@ denas_lora_test(){
         echo Stert "${model}" peft lora finetune :
         docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"
     fi
+}
+
+finetune_test_gaudi(){
+    local model=$1
+    echo Set finetune source config :
+    docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
+    echo Set "${model}" patch_yaml_config :
+    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} "
+    echo Start "${model}" finetune :
+    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+}
+
+peft_lora_test_gaudi(){
+    local model=$1
+    docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+    echo Set "${model}" patch_yaml_config :
+    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} --peft_lora"
+    echo Start "${model}" peft lora finetune :
+    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+}
+
+denas_lora_test_gaudi(){
+    local model=$1
+    if [[ ${model} =~ ^(mosaicml\/mpt-7b|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf|mistralai\/Mistral-7B-v0.1|google\/gemma-2b)$ ]]; then
+        echo ${model} is not supported!
+    else
+        docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+        echo Set "${model}" patch_yaml_config :
+        docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} --peft_lora --denas_lora"
+        echo Start "${model}" denas lora finetune :
+        docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+    fi
 }
\ No newline at end of file
diff --git a/llm_on_ray/finetune/finetune_gaudi.yaml b/llm_on_ray/finetune/finetune_gaudi.yaml
new file mode 100644
index 000000000..d44c8bce8
--- /dev/null
+++ b/llm_on_ray/finetune/finetune_gaudi.yaml
@@ -0,0 +1,38 @@
+General:
+  base_model: EleutherAI/gpt-j-6b
+  gpt_base_model: true
+  output_dir: /tmp/llm-ray/output
+  save_strategy: no
+  config:
+    trust_remote_code: false
+    use_auth_token: null
+  lora_config:
+    task_type: CAUSAL_LM
+    r: 8
+    lora_alpha: 32
+    lora_dropout: 0.1
+  enable_gradient_checkpointing: false
+Dataset:
+  train_file: examples/data/sample_finetune_data_small.jsonl
+  group: true
+  max_length: 512
+  block_size: 512
+  shuffle: false
+  validation_file: null
+  validation_split_percentage: 5
+Training:
+  optimizer: adamw_torch
+  batch_size: 2
+  epochs: 3
+  learning_rate: 1.0e-05
+  lr_scheduler: linear
+  weight_decay: 0.0
+  mixed_precision: bf16
+  device: hpu
+  num_training_workers: 2
+  resources_per_worker:
+    CPU: 32
+  accelerate_mode: DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
+  deepspeed_config_file: finetune/ds_config_zero2.json

From 50268df3c17742f39a1cbf5e2de07e493b492464 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:29:50 +0000
Subject: [PATCH 2/6] add to workflow

---
 .github/workflows/workflow_orders_on_merge.yml | 4 ++++
 .github/workflows/workflow_orders_on_pr.yml    | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml
index 69d72a7f8..af66377e8 100644
--- a/.github/workflows/workflow_orders_on_merge.yml
+++ b/.github/workflows/workflow_orders_on_merge.yml
@@ -27,3 +27,7 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
+
+  Finetune_Gaudi:
+    needs: Lint
+    uses: ./.github/workflows/workflow_finetune_gaudi2.yml
diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml
index acec72a6d..2ea9e76ea 100644
--- a/.github/workflows/workflow_orders_on_pr.yml
+++ b/.github/workflows/workflow_orders_on_pr.yml
@@ -27,3 +27,7 @@ jobs:
   Finetune:
     needs: Lint
     uses: ./.github/workflows/workflow_finetune.yml
+
+  Finetune_Gaudi:
+    needs: Lint
+    uses: ./.github/workflows/workflow_finetune_gaudi2.yml

From df1cbc727f27987a1b5fe5dbca4d9da3d68071b9 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:40:43 +0000
Subject: [PATCH 3/6] nit

---
 dev/scripts/ci-functions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index 213144be2..e7849b596 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -120,8 +120,8 @@ start_docker_gaudi() {
     docker_args+=("--name=${TARGET}" )
     docker_args+=("--hostname=${TARGET}-container")
 
-    echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
-    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
+    echo "docker run -tid "${docker_args[@]}" "${TARGET}:habana""
+    docker run -tid "${docker_args[@]}" "${TARGET}:habana"
     if [ -z "$HF_TOKEN" ]; then
         echo "no hf token"
     else

From f12fe5f53a7136d6b0a145eb65ea9b0e3bbe0dc0 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 09:09:42 +0000
Subject: [PATCH 4/6] remove deltatuner

---
 .github/workflows/workflow_finetune_gaudi2.yml | 6 ------
 dev/scripts/ci-functions.sh                    | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/.github/workflows/workflow_finetune_gaudi2.yml b/.github/workflows/workflow_finetune_gaudi2.yml
index c91a7204d..a79db5b9d 100644
--- a/.github/workflows/workflow_finetune_gaudi2.yml
+++ b/.github/workflows/workflow_finetune_gaudi2.yml
@@ -88,11 +88,6 @@ jobs:
           source dev/scripts/ci-functions.sh
           peft_lora_test_gaudi ${{ matrix.model }}
 
-      - name: Run Deltatuner Test on DENAS-LoRA Model
-        run: |
-          source dev/scripts/ci-functions.sh
-          denas_lora_test_gaudi ${{ matrix.model }}
-
       - name: Stop Ray
         run: |
           TARGET="finetune"
@@ -105,4 +100,3 @@ jobs:
           TARGET="finetune"
           source dev/scripts/ci-functions.sh
           stop_container ${TARGET}
-
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index e7849b596..96ceb6fb2 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -362,4 +362,4 @@ denas_lora_test_gaudi(){
         echo Start "${model}" denas lora finetune :
         docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
     fi
-}
\ No newline at end of file
+}

From 3ec0a3fe31b37f7297ec26acfa72fd496a4d8fbe Mon Sep 17 00:00:00 2001
From: Deegue
Date: Fri, 31 May 2024 03:51:55 +0000
Subject: [PATCH 5/6] fix transformers

---
 dev/scripts/ci-functions.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index 5e2c3345e..993e71dde 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -289,6 +289,7 @@ peft_lora_test(){
 finetune_test_gaudi(){
     local model=$1
     echo Set finetune source config :
+    docker exec "finetune" bash -c "pip install --upgrade-strategy eager optimum[habana]"
     docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
     echo Set "${model}" patch_yaml_config :
     docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path "llm_on_ray/finetune/finetune_gaudi.yaml" --models ${model} "

From c63697a10d86d833e05b6c7cdbf53b9bf3cfebf3 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Fri, 31 May 2024 03:52:14 +0000
Subject: [PATCH 6/6] remove deepspeed config file

---
 llm_on_ray/finetune/finetune_gaudi.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llm_on_ray/finetune/finetune_gaudi.yaml b/llm_on_ray/finetune/finetune_gaudi.yaml
index d44c8bce8..a972fe918 100644
--- a/llm_on_ray/finetune/finetune_gaudi.yaml
+++ b/llm_on_ray/finetune/finetune_gaudi.yaml
@@ -35,4 +35,3 @@ Training:
   accelerate_mode: DDP
   gradient_accumulation_steps: 1
   logging_steps: 10
-  deepspeed_config_file: finetune/ds_config_zero2.json