
[Finetune] Add finetune Gaudi workflow #240

Open · wants to merge 11 commits into main
102 changes: 102 additions & 0 deletions .github/workflows/workflow_finetune_gaudi2.yml
@@ -0,0 +1,102 @@
name: Finetune

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'
      runner_container_image:
        type: string
        default: '127.0.0.1:5000/llmray-build'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
      code_checkout_path:
        type: string
        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
      model_cache_path:
        type: string
        default: '/scratch-2/huggingface/cache'

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-gaudi2
  cancel-in-progress: true

jobs:
  finetune:
    name: finetune
    strategy:
      matrix:
        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
        isPR:
          - ${{inputs.ci_type == 'pr'}}

        exclude:
          - { isPR: true }
        include:
          - { model: "EleutherAI/gpt-j-6b"}
          - { model: "meta-llama/Llama-2-7b-chat-hf"}
          - { model: "mistralai/Mistral-7B-v0.1"}
          - { model: "google/gemma-2b"}

    runs-on: gaudi2

    defaults:
      run:
        shell: bash
    container:
      image: ${{ inputs.runner_container_image }}
      env:
        http_proxy:
        https_proxy:
        SHELL: bash -eo pipefail
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
        - ${{ inputs.runner_config_path }}:/root/actions-runner-config

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Load environment variables
        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV

      - name: Build Docker Image
        run: |
          DF_SUFFIX=".habana"
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          build_and_prune_gaudi ${TARGET} ${DF_SUFFIX}

      - name: Start Docker Container
        run: |
          TARGET="finetune"
          code_checkout_path=${{ inputs.code_checkout_path }}
          model_cache_path=${{ inputs.model_cache_path }}
          source dev/scripts/ci-functions.sh
          start_docker_gaudi ${TARGET} ${code_checkout_path} ${model_cache_path} ${{env.HF_ACCESS_TOKEN}}

      - name: Run Finetune Test
        run: |
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          finetune_test_gaudi ${{ matrix.model }}

      - name: Run PEFT-LoRA Test
        run: |
          source dev/scripts/ci-functions.sh
          peft_lora_test_gaudi ${{ matrix.model }}

      - name: Stop Ray
        run: |
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          stop_ray ${TARGET}

      - name: Stop Container
        if: success() || failure()
        run: |
          TARGET="finetune"
          source dev/scripts/ci-functions.sh
          stop_container ${TARGET}
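
(For reference: the two caller workflows below use the defaults, but any of the workflow_call inputs declared above can be overridden by a caller via with:. A hypothetical example, not part of this diff — the 'nightly' value is an assumption:

  Finetune_Gaudi:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune_gaudi2.yml
    with:
      ci_type: 'nightly'   # hypothetical value overriding the 'pr' default
)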
4 changes: 4 additions & 0 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -27,6 +27,10 @@ jobs:
  Finetune:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune.yml

  Finetune_Gaudi:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune_gaudi2.yml

  Benchmark:
    needs: Lint
8 changes: 8 additions & 0 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -27,3 +27,11 @@ jobs:
  Finetune:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune.yml

  Finetune_Gaudi:
    needs: Lint
    uses: ./.github/workflows/workflow_finetune_gaudi2.yml

  Benchmark:
    needs: Lint
    uses: ./.github/workflows/workflow_test_benchmark.yml
74 changes: 74 additions & 0 deletions dev/scripts/ci-functions.sh
@@ -31,7 +31,27 @@ build_and_prune() {
    # Build Docker image and perform cleaning operation
    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes \
        | docker image prune -f
}

build_and_prune_gaudi() {
Contributor comment: there is no need to duplicate code, could you merge this with build_and_prune

    # Set TARGET, DF_SUFFIX, and optional PYTHON_V from the passed-in parameters
    local TARGET=$1
    local DF_SUFFIX=$2
    local PYTHON_V=$3

    docker_args=()
    docker_args+=("--build-arg=CACHEBUST=1")

    if [ -n "$PYTHON_V" ]; then
        docker_args+=("--build-arg=python_v=${PYTHON_V}")
    fi

    echo "Build Docker image and perform cleaning operation"
    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f"

    # Build Docker image and perform cleaning operation
    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes \
        | docker image prune -f
}
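
(A possible shape for the merge suggested in the review comment above — a minimal, untested sketch that folds the image tag into an optional parameter so one build_and_prune covers both the :latest and :habana builds. The TAG parameter and its default are assumptions, not code from this PR.)

build_and_prune() {
    # Build ${TARGET}:${TAG} from Dockerfile${DF_SUFFIX}, then prune stopped containers and dangling images
    local TARGET=$1
    local DF_SUFFIX=$2
    local PYTHON_V=$3
    local TAG=${4:-latest}  # hypothetical 4th parameter; Gaudi callers would pass "habana"

    docker_args=()
    docker_args+=("--build-arg=CACHEBUST=1")

    if [ -n "$PYTHON_V" ]; then
        docker_args+=("--build-arg=python_v=${PYTHON_V}")
    fi

    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:${TAG} && yes | docker container prune && yes \
        | docker image prune -f
}

The Gaudi workflow step would then call, e.g., build_and_prune ${TARGET} ${DF_SUFFIX} "" habana.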

start_docker() {
@@ -68,6 +88,41 @@ start_docker() {
    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
}

start_docker_gaudi() {
    local TARGET=$1
    local code_checkout_path=$2
    local model_cache_path=$3
    local HF_TOKEN=$4

    cid=$(docker ps -q --filter "name=${TARGET}")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
    # check and remove exited container
    cid=$(docker ps -a -q --filter "name=${TARGET}")
    if [[ ! -z "$cid" ]]; then docker rm $cid; fi
    docker ps -a

    docker_args=()
    docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")

    if [ -z "$model_cache_path" ]; then
        echo "no cache path"
    else
        docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")
    fi

    docker_args+=("--runtime=habana")
    docker_args+=("--name=${TARGET}")
    docker_args+=("--hostname=${TARGET}-container")

    echo "docker run -tid "${docker_args[@]}" "${TARGET}:habana""
    docker run -tid "${docker_args[@]}" "${TARGET}:habana"
    if [ -z "$HF_TOKEN" ]; then
        echo "no hf token"
    else
        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
    fi
}

install_dependencies(){
    local TARGET=$1

@@ -225,3 +280,22 @@ peft_lora_test(){
    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"
}

finetune_test_gaudi(){
    local model=$1
    echo "Set finetune source config:"
    docker exec "finetune" bash -c "pip install --upgrade-strategy eager optimum[habana]"
    docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
    echo "Set ${model} patch_yaml_config:"
    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path llm_on_ray/finetune/finetune_gaudi.yaml --models ${model}"
    echo "Start ${model} finetune:"
    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
}

peft_lora_test_gaudi(){
    local model=$1
    docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
    echo "Set ${model} patch_yaml_config:"
    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path llm_on_ray/finetune/finetune_gaudi.yaml --models ${model} --peft_lora"
    echo "Start ${model} peft lora finetune:"
    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
}
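
(For reference, a hypothetical local run chaining the helpers above the same way the workflow does — this assumes a Gaudi host with the habana Docker runtime, the repo root as the working directory, and HF_ACCESS_TOKEN exported; stop_ray and stop_container are the existing helpers this file already defines:

source dev/scripts/ci-functions.sh
build_and_prune_gaudi finetune .habana             # builds the finetune:habana image
start_docker_gaudi finetune "$PWD" /scratch-2/huggingface/cache "$HF_ACCESS_TOKEN"
finetune_test_gaudi gpt2                           # any model from the CI matrix
peft_lora_test_gaudi gpt2
stop_ray finetune
stop_container finetune
)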
37 changes: 37 additions & 0 deletions llm_on_ray/finetune/finetune_gaudi.yaml
@@ -0,0 +1,37 @@
General:
  base_model: EleutherAI/gpt-j-6b
  gpt_base_model: true
  output_dir: /tmp/llm-ray/output
  save_strategy: no
  config:
    trust_remote_code: false
    use_auth_token: null
  lora_config:
    task_type: CAUSAL_LM
    r: 8
    lora_alpha: 32
    lora_dropout: 0.1
  enable_gradient_checkpointing: false
Dataset:
  train_file: examples/data/sample_finetune_data_small.jsonl
  group: true
  max_length: 512
  block_size: 512
  shuffle: false
  validation_file: null
  validation_split_percentage: 5
Training:
  optimizer: adamw_torch
  batch_size: 2
  epochs: 3
  learning_rate: 1.0e-05
  lr_scheduler: linear
  weight_decay: 0.0
  mixed_precision: bf16
  device: hpu
  num_training_workers: 2
  resources_per_worker:
    CPU: 32
  accelerate_mode: DDP
  gradient_accumulation_steps: 1
  logging_steps: 10
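
(The CI never edits this file directly: patch_yaml_config.py rewrites it per matrix model before each run. As a rough illustration only — the exact fields the script touches are an assumption — a patched run for Mistral would amount to:

General:
  base_model: mistralai/Mistral-7B-v0.1
  gpt_base_model: false   # presumably true only for GPT-style models such as the gpt-j default
)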