Skip to content

Commit

Permalink
[finetune] some fixes (#103)
Browse files Browse the repository at this point in the history
* fixes

* update

* update

* update

* remove accelerator tracking, replace with ray train.report

* update

* update

* update

* update

* update
  • Loading branch information
harborn authored Feb 23, 2024
1 parent b3ffd55 commit 535de7d
Show file tree
Hide file tree
Showing 16 changed files with 69 additions and 50 deletions.
14 changes: 8 additions & 6 deletions .github/workflows/config/update_finetune_config_on_intel_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import argparse


def update_finetune_config(base_model):
conf_file = "finetune/finetune.yaml"
with open(conf_file) as f:
def update_finetune_config(config_file, base_model):
with open(config_file) as f:
config = yaml.load(f, Loader=yaml.FullLoader)
# The compute node cannot reach the network, so
# base models are downloaded as local files in the directory ~/models/
Expand All @@ -23,18 +22,21 @@ def update_finetune_config(base_model):
# pythia-6.9b

config["General"]["base_model"] = base_model
# config["General"]["base_model"] = "pythia-70m"
config["General"]["output_dir"] = "./output"
config["General"]["checkpoint_dir"] = "./checkpoint"
config["Training"]["device"] = "GPU"
config["Training"]["resources_per_worker"]["CPU"] = 1
config["Training"]["resources_per_worker"]["GPU"] = 1
config["Training"]["accelerate_mode"] = "GPU_DDP"
config["Training"]["logging_steps"] = 1

with open(conf_file, "w") as f:
with open(config_file, "w") as f:
yaml.dump(config, f, sort_keys=False)


def get_parser():
    """Build the command-line parser for the Intel GPU finetune config updater.

    Returns:
        argparse.ArgumentParser: a parser that requires both ``--config_file``
        and ``--base_model``, each parsed as a string.
    """
    parser = argparse.ArgumentParser(description="Finetuning on Intel GPU")
    # Both options are mandatory, so no default is declared: argparse never
    # falls back to a default for an option marked required.
    parser.add_argument(
        "--config_file",
        type=str,
        required=True,
        help="Path of the finetune YAML config file to update.",
    )
    parser.add_argument(
        "--base_model",
        type=str,
        required=True,
        help="Base model name to write into the config.",
    )
    return parser

Expand All @@ -43,4 +45,4 @@ def get_parser():
parser = get_parser()
args = parser.parse_args()

update_finetune_config(args.base_model)
update_finetune_config(args.config_file, args.base_model)
10 changes: 5 additions & 5 deletions .github/workflows/workflow_finetune_gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ on:
default: '10.1.2.13:5000/llmray-build'
http_proxy:
type: string
default: 'http://proxy-chain.intel.com:911'
default: 'http://10.24.221.149:911'
https_proxy:
type: string
default: 'http://proxy-chain.intel.com:911'
default: 'http://10.24.221.149:911'

jobs:
finetune:
name: finetune on gpu test
strategy:
matrix:
model: [ pythia-6.9b, gpt-j-6b ]
model: [ meta-llama/Llama-2-7b-chat-hf ]
runs-on: self-hosted

defaults:
Expand All @@ -41,6 +41,6 @@ jobs:
rm ~/borealis-runner/llm-on-ray.tar.gz -f
tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
cd ~/borealis-runner/
python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}"
- name: Test Summary
run: echo "to be continued"
run: echo "to be continued"
2 changes: 1 addition & 1 deletion common/dataset/huggingface_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __call__(self, config):
if validation_file is not None:
validation_dataset = local_load(validation_file)
return datasets.DatasetDict(
{"train": train_dataset, "validation_dataset": validation_dataset}
{"train": train_dataset, "validation": validation_dataset}
)
if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0:
datasets_dict = train_dataset.train_test_split(
Expand Down
18 changes: 5 additions & 13 deletions common/trainer/default_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,10 @@ def train(self):
max_train_step = self.config.get("max_train_step")
max_eval_step = self.config.get("max_eval_step")
for idx in range(self.starting_epoch, num_train_epochs, 1):
logger.info(f"start train epoch {idx}")
self.model.train()
start = time.time()
total_steps = len(self.train_dataloader)
logger.info(f"Start training epoch {idx}, total_steps {total_steps}")
for step, batch in enumerate(self.train_dataloader):
with self.accelerator.accumulate(self.model):
outputs = self.model(**batch)
Expand All @@ -172,13 +172,14 @@ def train(self):
if step % logging_steps == 0:
loss = loss.item()
ppl = math.exp(loss)
epochs = (step + idx * total_steps) / (num_train_epochs * total_steps)
logger.info(
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
)
report(
{
"loss": loss,
"ppl": ppl,
"train_loss": loss,
"train_ppl": ppl,
"train_epoch": idx,
"total_epochs": num_train_epochs,
"train_step": step,
Expand All @@ -187,10 +188,6 @@ def train(self):
else total_steps,
}
)
self.accelerator.log(
{"train loss": loss, "train perplexity": ppl},
step=idx * total_steps + step,
)
start = time.time()
if max_train_step is not None:
if step >= max_train_step - 1:
Expand Down Expand Up @@ -221,9 +218,6 @@ def train(self):
except OverflowError:
eval_loss = float("inf")
perplexity = float("inf")
self.accelerator.log(
{"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
)
logger.info(
f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
)
Expand All @@ -243,8 +237,6 @@ def train(self):
)
logger.info(f"finish save model to {output}")

self.accelerator.end_training()

self.accelerator.wait_for_everyone()

def _get_local_path(self, root_path, model_name):
Expand Down
3 changes: 1 addition & 2 deletions docs/finetune_parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow.
|gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is related to gpt, otherwise it is False.|
|output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
|checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
|config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
|lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
|deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
Expand All @@ -34,7 +33,7 @@ The following are the parameters supported in the finetuning workflow.
|learning_rate|1e-5|Initial learning rate to use.|
|lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"|
|weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.|
|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set.
|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16". Default is "no" if not set.|
|device|CPU|The device type used, can be "CPU", "GPU".|
|num_training_workers|2|The number of the training process.|
|resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.|
Expand Down
4 changes: 4 additions & 0 deletions examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data.jsonl
validation_file: null
Expand All @@ -22,9 +23,12 @@ Training:
learning_rate: 1.0e-05
lr_scheduler: linear
weight_decay: 0.0
mixed_precision: bf16
device: GPU
num_training_workers: 2
accelerate_mode: GPU_DDP
resources_per_worker:
CPU: 1
GPU: 1
gradient_accumulation_steps: 1
logging_steps: 10
31 changes: 12 additions & 19 deletions finetune/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import argparse
from typing import Any, Dict, Union
from typing import Any, Dict, Union, Optional

import torch
import accelerate
Expand Down Expand Up @@ -63,12 +63,13 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
return mode_env_vars[mode]


def convert_dtype(dtype: str) -> torch.dtype:
    """Translate a short precision name into the matching ``torch.dtype``.

    Args:
        dtype: One of "fp16", "bf16" or "fp32".

    Returns:
        ``torch.float16``, ``torch.bfloat16`` or ``torch.float32`` respectively.

    Raises:
        ValueError: If ``dtype`` is not one of the supported names.
    """
    supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
    if dtype in supported_dtypes:
        return supported_dtypes[dtype]
    # Format the key list explicitly: interpolating ``.keys()`` directly would
    # embed the ``dict_keys([...])`` wrapper in the message.
    raise ValueError(
        f"only supported torch.dtype list {list(supported_dtypes)}, got {dtype!r}"
    )
def convert_dtype(dtype: str) -> Optional[torch.dtype]:
    """Map a ``mixed_precision`` setting to the torch dtype it selects.

    Args:
        dtype: One of "fp16", "bf16" or "no".

    Returns:
        ``torch.float16`` for "fp16", ``torch.bfloat16`` for "bf16", and
        ``None`` for "no".

    Raises:
        KeyError: If ``dtype`` is not one of the supported names. The type is
            the same one the plain dict lookup raises; the message lists the
            accepted values.
    """
    supported_dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "no": None,
    }
    try:
        return supported_dtypes[dtype]
    except KeyError:
        # Re-raise with the accepted names so a bad value is easy to diagnose,
        # instead of surfacing only the offending key.
        raise KeyError(
            f"unsupported mixed_precision {dtype!r}, "
            f"expected one of {list(supported_dtypes)}"
        ) from None


def train_func(config: Dict[str, Any]):
Expand All @@ -89,24 +90,14 @@ def train_func(config: Dict[str, Any]):
else:
fsdp_plugin = None

log_with = "tensorboard" # only support tensorboard as tracker
output_dir = config["General"]["output_dir"]
tracking_dir = config["General"]["tracking_dir"]
accelerator = accelerate.Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps,
fsdp_plugin=fsdp_plugin,
log_with=log_with,
project_dir=tracking_dir,
)
epochs = config["Training"]["epochs"]
tracker_config = {
"epochs": epochs,
"learning_rate": config["Training"]["learning_rate"],
"batch_size": config["Training"]["batch_size"],
}
base_model = config["General"]["base_model"]
dataset_file = config["Dataset"]["train_file"]
accelerator.init_trackers("fine-tuning", config=tracker_config)

common.logger.info(
f"accelerator generate finish, accelerator device type = {accelerator.device}"
Expand Down Expand Up @@ -134,9 +125,11 @@ def train_func(config: Dict[str, Any]):
model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
config={
"name": base_model,
"dtype": convert_dtype(config["Training"]["mixed_precision"]),
"dtype": convert_dtype(config["Training"].get("mixed_precision", "no")),
"config": config["General"]["config"],
"enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
"enable_gradient_checkpointing": config["General"].get(
"enable_gradient_checkpointing", False
),
"lora_config": config["General"]["lora_config"]
if config["General"].get("lora_config")
else None,
Expand Down
3 changes: 1 addition & 2 deletions finetune/finetune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ General:
gpt_base_model: true
output_dir: /tmp/llm-ray/output
checkpoint_dir: /tmp/llm-ray/checkpoint
tracking_dir: /tmp/llm-ray/tracking
config:
trust_remote_code: false
use_auth_token: null
Expand All @@ -30,5 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 2
gradient_accumulation_steps: 1
logging_steps: 10
10 changes: 8 additions & 2 deletions finetune/finetune_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class General(BaseModel):
gpt_base_model: bool
output_dir: str
checkpoint_dir: str
tracking_dir: str
config: GeneralConfig
lora_config: Optional[LoraConfig] = None
deltatuner_config: Optional[DeltatunerConfig] = None
Expand Down Expand Up @@ -56,7 +55,7 @@ class Training(BaseModel):
resources_per_worker: RayResourceConfig
accelerate_mode: str
mixed_precision: str = "no"
gradient_accumulation_steps: int
gradient_accumulation_steps: int = 1
logging_steps: int = 10

@validator("device")
Expand All @@ -73,6 +72,13 @@ def check_accelerate_mode(cls, v: str):
raise ValueError(f"accelerate_mode must be one of {modes}")
return v

@validator("mixed_precision")
def check_mixed_precision(cls, v: str):
    """Validate the ``mixed_precision`` field.

    Returns ``v`` unchanged when it is "no", "fp16" or "bf16"; raises
    ``ValueError`` for any other value.
    """
    supported_precisions = ["no", "fp16", "bf16"]
    if v in supported_precisions:
        return v
    raise ValueError(f"mixed_precision must be one of {supported_precisions}")

@validator("logging_steps")
def check_logging_steps(cls, v: int):
assert v > 0
Expand Down
3 changes: 3 additions & 0 deletions finetune/models/bloom-560m.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/finetune_config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/gpt-j-6b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/gpt2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
6 changes: 6 additions & 0 deletions finetune/models/llama-2-7b-chat-hf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
target_modules:
- q_proj
- v_proj
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +32,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/mistral-7b-v0.1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ General:
- up_proj
- down_proj
- lm_head
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -37,3 +38,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/mpt-7b-chat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10

0 comments on commit 535de7d

Please sign in to comment.