Commit

update
Signed-off-by: minmingzhu <[email protected]>
minmingzhu committed Jul 2, 2024
1 parent 05fdf80 commit fb3152e
Showing 7 changed files with 40 additions and 221 deletions.
1 change: 0 additions & 1 deletion dev/scripts/patch_yaml_config.py
@@ -78,7 +78,6 @@ def patch_yaml_config():
result["Training"]["beta"] = 0.1
result["Training"]["finetuning_model"]["dpo"] = True


with open(conf_path, "w") as output:
yaml.dump(result, output, sort_keys=False)
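
For orientation, a minimal standalone sketch (not part of this commit) of what the two patched Training fields serialise to once yaml.dump writes the config back; every other key of the real config is omitted here.

import yaml

result = {"Training": {"beta": 0.1, "finetuning_model": {"dpo": True}}}
print(yaml.dump(result, sort_keys=False))
# Training:
#   beta: 0.1
#   finetuning_model:
#     dpo: true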

159 changes: 0 additions & 159 deletions llm_on_ray/finetune/data_preprocess.py

This file was deleted.

24 changes: 14 additions & 10 deletions llm_on_ray/finetune/data_process.py
@@ -228,16 +228,20 @@ def __init__(self, config, tokenizer):
self.config = config

def make_prompt(self, examples):
return {
"prompt": " ".join(
[
system + question
for system, question in zip(examples["system"], examples["question"])
]
),
"chosen": examples["chosen"],
"rejected": examples["rejected"],
}
prompts = {}
prompts["prompt"] = []
prompts["chosen"] = []
prompts["rejected"] = []

for rec in examples:
prompts["prompt"].append(
" ".join(
[system + question for system, question in zip(rec["system"], rec["question"])]
)
)
prompts["chosen"].append(rec["chosen"])
prompts["rejected"].append(rec["rejected"])
return prompts
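
For context, a minimal standalone sketch (not part of the diff) of the structure the reworked make_prompt now returns: a dict of parallel lists that datasets.Dataset.from_dict consumes directly in the callers shown further down. The strings are placeholders, not real dataset content.

import datasets

prompts = {
    "prompt": ["<system text><question text>"],
    "chosen": ["<preferred answer>"],
    "rejected": ["<dispreferred answer>"],
}
dpo_split = datasets.Dataset.from_dict(prompts)
print(dpo_split.column_names)  # ['prompt', 'chosen', 'rejected']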

"""
Copied from https://github.com/intel/intel-extension-for-transformers/blob/5ba5fa8048b63bec8a3be8a7122a3db8344ad065/
7 changes: 1 addition & 6 deletions llm_on_ray/finetune/dpo_finetuing.py
@@ -31,18 +31,13 @@
class DPOFineTuning(Finetuning):
def tokenize_dataset(self, config: Dict, tokenizer, dataset):
processor = DPOIntelOrcaProcessor(config, tokenizer)

print(dataset)
for key in dataset:
prompts = processor.make_prompt(dataset[key])
dataset[key] = datasets.Dataset.from_dict(prompts)

train_dataset = dataset["train"]
column_names = list(train_dataset.features)
(
processor.tokenize_by_neural_chat
if config["Dataset"].get("data_preprocess_type", "neural_chat") == "neural_chat"
else processor.tokenize
)
if train_dataset is not None:
# Create train feature from dataset
train_dataset = train_dataset.map(
7 changes: 1 addition & 6 deletions llm_on_ray/finetune/finetune.py
@@ -40,14 +40,9 @@
from pydantic_yaml import parse_yaml_raw_as

from llm_on_ray import common
from llm_on_ray.finetune.data_process import DataProcessor
from llm_on_ray.finetune.dpo_funetuing import (
DPOFuneTuning,
GaudiDPOFuneTuning,
)

from llm_on_ray.finetune.finetune_config import FinetuneConfig


def train_func(config: Dict[str, Any]):
os.chdir(config["cwd"])
from .finetuning import Finetuning
2 changes: 1 addition & 1 deletion llm_on_ray/finetune/finetune_config.py
@@ -84,7 +84,7 @@ class Dataset(BaseModel):
pad_max: bool = False
torch_dtype: str = "bfloat16"
max_prompt_length: int = 512
torch_dtype: str = "bfloat16"


class RayResourceConfig(BaseModel):
CPU: int
61 changes: 23 additions & 38 deletions llm_on_ray/finetune/finetuning.py
@@ -34,7 +34,7 @@
from pydantic_yaml import parse_yaml_raw_as

from llm_on_ray import common
from llm_on_ray.finetune import template
from llm_on_ray.finetune.data_process import DataProcessor
from llm_on_ray.finetune.finetune_config import FinetuneConfig
from importlib import util

@@ -134,7 +134,16 @@ def load_tokenizer(self, config: Dict):
else:
tokenizer_name = config["General"]["base_model"]
load_config = config["General"].get("config", {})
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, **load_config)
# default padding side is right
padding_side = config["Dataset"].get("padding_side", "right")
# default truncation side is right
truncation_side = config["Dataset"].get("truncation_side", "right")
tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer_name,
padding_side=padding_side,
truncation_side=truncation_side,
**load_config,
)
return tokenizer
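
As a side note, a minimal standalone sketch (not part of this commit) of the configurable padding/truncation behaviour introduced above: both sides now come from the Dataset section of the config and fall back to "right" when unset. "gpt2" is only a stand-in for General.base_model, and the empty Dataset section is an assumption for illustration.

import transformers

dataset_cfg = {}  # assume an empty Dataset section, so both defaults apply
padding_side = dataset_cfg.get("padding_side", "right")
truncation_side = dataset_cfg.get("truncation_side", "right")
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "gpt2",
    padding_side=padding_side,
    truncation_side=truncation_side,
)
print(tokenizer.padding_side, tokenizer.truncation_side)  # right right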

def load_dataset(self, config: Dict):
@@ -189,50 +198,27 @@ def local_load(name, **load_config):
return raw_dataset

def tokenize_dataset(self, config: Dict, tokenizer, dataset):
max_length = config["Dataset"].get("max_length", 512)
group = config["Dataset"].get("group", True)
block_size = config["Dataset"].get("block_size", 512)
tokenizer.pad_token = tokenizer.eos_token

if isinstance(dataset, datasets.Dataset):
column_names = dataset.column_names

if isinstance(dataset, datasets.DatasetDict):
column_names = dataset["train"].column_names

if column_names and template.TEXT_COLUMN_NAME not in column_names:

def prompt(rec):
instruction = rec["instruction"]
response = rec["response"]
context = rec.get("context")
if not instruction:
raise ValueError(f"Expected an instruction in: {rec}")
if not response:
raise ValueError(f"Expected a response in: {rec}")
if context:
rec["text"] = template.PROMPT_WITH_INPUT_FORMAT.format(
instruction=instruction, response=response, input=context
)
else:
rec["text"] = template.PROMPT_NO_INPUT_FORMAT.format(
instruction=instruction, response=response
)
return rec
processor = DataProcessor(config, tokenizer)

dataset = dataset.map(
prompt,
load_from_cache_file=False,
desc="Prompt",
)
column_names += [template.TEXT_COLUMN_NAME]
for key in dataset:
prompts = processor.make_prompt(dataset[key])
dataset[key] = datasets.Dataset.from_dict(prompts)

def tokenize_function(examples):
return tokenizer(examples[template.TEXT_COLUMN_NAME], max_length=max_length)
column_names = list(dataset["train"].features)
tokenize_fn = (
processor.tokenize_by_neural_chat
if config["Dataset"].get("data_preprocess_type", "neural_chat") == "neural_chat"
else processor.tokenize
)

tokenized_dataset = dataset.map(
tokenize_function,
tokenize_fn,
remove_columns=column_names,
batched=True,
load_from_cache_file=False,
desc="Tokenize dataset",
)
@@ -252,7 +238,6 @@ def group_texts(examples):
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy()
return result

tokenized_dataset = tokenized_dataset.map(
