Commit

chatglm
loxs123 committed Jan 9, 2024
1 parent 4882e60 commit 8e2b6ee
Showing 13 changed files with 365 additions and 161 deletions.
17 changes: 6 additions & 11 deletions projects/ChatGLM/README.md
@@ -7,23 +7,18 @@ The ChatGLM Supervised FineTuning project can support 3D parallel.
## FineTuning ChatGLM3
FineTuning ChatGLM3 on 8 GPUs using parallelism.

### 1. Prepare the sft dataset
#### download dataset
### 1. Prepare environment variables
```bash
export DATA_DIR=~/DATA # [At the beginning, it was an empty folder]
cd $DATA_DIR
git clone https://www.modelscope.cn/datasets/YorickHe/CoT_zh.git
export DATA_DIR=~/DATA/alpaca # [At the beginning, it was an empty folder]
export CHATGLM_HF_DIR=modelscope/hub/ZhipuAI/chatglm3-6b # [Your ChatGLM huggingface path]
```

### 2. Prepare the sft dataset
#### preprocess
```bash
cd projects/ChatGLM
python utils/prepare_CoT_zh.py
python utils/prepare_data_alpaca.py
```
### 2. Prepare your finetuning config file

> set the finetuning parameters in `projects/ChatGLM/configs/chatglm_sft.py`, such as `dataset_path` and `pretrained_model_path`.

### 3. Run the following code to start SFT
```bash
@@ -51,4 +46,4 @@ python projects/ChatGLM/pipeline.py
- set `projects/ChatGLM/configs/chatglm_config.py`, lora_enable=True, same step with no lora.

### ChatGLM Lora Inference
- set `projects/ChatGLM/configs/chatglm_config.py`, lora_enable=True, lora_pretrained_model_path, same step with no lora.
- set `projects/ChatGLM/configs/chatglm_config.py`, lora_enable=True, same step with no lora.
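
For reference, the `ChatGLMTrainDataset` used by this SFT recipe (see `projects/ChatGLM/dataset.py` below) loads a column-oriented JSON file and reads `prompt`, `query`, and `response` per index. A minimal sketch of that layout follows; the exact file name written by `utils/prepare_data_alpaca.py` is an assumption, not something shown in this diff.

```python
import json
import os

# Hypothetical output path: the real file name produced by
# utils/prepare_data_alpaca.py is not visible in this commit.
data_file = os.path.join(os.environ["DATA_DIR"], "alpaca_sft.json")

# Column-oriented layout expected by ChatGLMTrainDataset:
# each key maps to a list, and index i across the lists forms one example.
sample = {
    "prompt": ["Below is an instruction. Write a response.\n"],
    "query": ["Give three tips for staying healthy."],
    "response": ["1. Eat well. 2. Exercise regularly. 3. Sleep enough."],
}

with open(data_file, "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)
```
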
6 changes: 4 additions & 2 deletions projects/ChatGLM/chatglm.py
@@ -14,6 +14,7 @@
from libai.layers import LayerNorm, Linear, RMSLayerNorm, VocabEmbedding
from libai.utils import distributed as dist


def apply_rotary_pos_emb(x: flow.Tensor, rope_cache: flow.Tensor) -> flow.Tensor:
# x: [sq, b, np, hn]
sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
@@ -122,7 +123,6 @@ def __init__(self, cfg, layer_number):
if self.apply_query_key_layer_scaling:
self.attention_softmax_in_fp32 = True
self.layer_number = max(1, layer_number)

projection_size = cfg.kv_channels * cfg.num_attention_heads

# Per attention head and per partition values.
@@ -894,7 +894,9 @@ def forward(
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss = self.loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss = self.loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
# loss = loss.mean()

if labels is not None:
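
The reformatted loss call above is standard next-token prediction: logits at position t are scored against the label at position t+1, with padded positions expected to be masked via an ignore index (the definition of `self.loss_fct` sits outside this hunk, so that detail is an assumption based on the `IGNORE_INDEX` masking in `dataset.py`). A standalone sketch:

```python
import oneflow as flow
import oneflow.nn as nn

# Toy shapes: batch=2, seq=5, vocab=11. -100 marks positions excluded from the loss.
lm_logits = flow.randn(2, 5, 11)
labels = flow.randint(0, 11, (2, 5))
loss_fct = nn.CrossEntropyLoss(ignore_index=-100)  # assumed loss; not shown in this hunk

# Shift so that position t predicts token t+1, then flatten for the loss.
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
print(loss)
```
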
24 changes: 10 additions & 14 deletions projects/ChatGLM/configs/chatglm_config.py
@@ -1,3 +1,4 @@
import os
from omegaconf import DictConfig, OmegaConf

from libai.config import LazyCall
@@ -14,7 +15,6 @@
apply_residual_connection_post_layernorm=False,
attention_dropout=0.0,
attention_softmax_in_fp32=True,
bias_dropout_fusion=True,
ffn_hidden_size=13696,
fp32_residual_connection=False,
hidden_dropout=0.0,
@@ -25,13 +25,11 @@
multi_query_group_num=2,
num_attention_heads=32,
num_layers=28,
original_rope=True,
padded_vocab_size=65024,
post_layer_norm=True,
rmsnorm=True,
seq_length=8192,
use_cache=True,
dtype="float16",
tie_word_embeddings=False,
eos_token_id=2,
bos_token_id=1,
@@ -42,7 +40,7 @@
amp_enabled=True,
# Inference
is_encoder_decoder=False,
max_length=256,
max_length=1350,
min_length=0,
do_sample=False,
early_stopping=False,
@@ -62,26 +60,24 @@
output_scores=False,
output_hidden_states=False,
# train
pretrained_model_path="YOUR_CHATGLM_HUGGINGFACE_PATH",

pretrained_model_path=os.environ["CHATGLM_HF_DIR"],
# lora_cfg
lora_enable = False,
lora_cfg = dict(
lora_enable=False,
lora_cfg=dict(
# Model
r=8,
target_modules=['query_key_value'],
target_modules=["query_key_value"],
lora_alpha=8,
lora_dropout=0.0,
fan_in_fan_out=False,
bias='lora_only',
hidden_layers=32,
bias="lora_only",
modules_to_save=None,
init_lora_weights=True, # or lora
init_lora_weights=True, # or lora
inference_mode=False,
rank_pattern=dict(),
alpha_pattern=dict(),
),
lora_pretrained_model_path = None # None for train
lora_pretrained_model_path=None, # None for train
)

cfg = DictConfig(cfg)
@@ -90,5 +86,5 @@
tokenization = OmegaConf.create()
tokenization.make_vocab_size_divisible_by = 1
tokenization.tokenizer = LazyCall(ChatGLMTokenizer)(
vocab_file="YOUR_CHATGLM_HUGGINGFACE_PATH/tokenizer.model"
vocab_file=f"{os.environ['CHATGLM_HF_DIR']}/tokenizer.model"
)
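
Both `pretrained_model_path` and the tokenizer `vocab_file` now come from the `CHATGLM_HF_DIR` environment variable, so loading this config fails with a `KeyError` if the variable is unset. A guarded lookup, offered only as a sketch and not part of the commit:

```python
import os

# The config in this commit indexes os.environ directly; a guarded variant
# (not part of the commit) makes the failure mode explicit.
chatglm_hf_dir = os.environ.get("CHATGLM_HF_DIR")
if chatglm_hf_dir is None:
    raise RuntimeError(
        "Set CHATGLM_HF_DIR to your local ChatGLM3 checkpoint, e.g. "
        "modelscope/hub/ZhipuAI/chatglm3-6b, before loading chatglm_config.py"
    )

pretrained_model_path = chatglm_hf_dir
vocab_file = f"{chatglm_hf_dir}/tokenizer.model"
```
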
13 changes: 7 additions & 6 deletions projects/ChatGLM/configs/chatglm_sft.py
@@ -15,15 +15,14 @@
from projects.ChatGLM.tokenizer import ChatGLMTokenizer
from projects.ChatGLM.chatglm import ChatGLMForConditionalGeneration


# Hyperparameters
weight_decay = 0.1
learning_rate = 2e-5
max_source_len = 128
max_target_len = 128
dataset_path = "YOUR_DATA_PATH/CoT_zh"
pretrained_model_path = "YOUR_CHATGLM_HUGGINGFACE_PATH"
fast_dev_run=True
max_length = 256
dataset_path = os.environ["DATA_DIR"]
pretrained_model_path = os.environ["CHATGLM_HF_DIR"]

# graph & optim
graph["enabled"] = True
@@ -54,6 +53,7 @@
tokenizer=tokenization.tokenizer,
max_source_len=max_source_len,
max_target_len=max_target_len,
max_length=max_length,
)
]
)
@@ -64,14 +64,15 @@
tokenizer=tokenization.tokenizer,
max_source_len=max_source_len,
max_target_len=max_target_len,
max_length=max_length,
)
),
]

train.update(
dict(
output_dir="./sft_result",
train_micro_batch_size=2,
train_micro_batch_size=1,
test_micro_batch_size=1,
train_epoch=3,
train_iter=1,
@@ -86,7 +87,7 @@
max_to_keep=1,
),
dist=dict(
data_parallel_size=2,
data_parallel_size=1,
tensor_parallel_size=1,
pipeline_parallel_size=4,
pipeline_num_layers=cfg.num_layers,
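
The updated `dist` block drops the 2-way data parallel split in favor of a pure pipeline split. As a rough sanity check (not part of the commit), Megatron-style 3D parallelism in LiBai needs `data_parallel_size * tensor_parallel_size * pipeline_parallel_size` processes, so 1 x 1 x 4 is assumed to target a 4-process launch:

```python
# Sanity check on the parallel layout in this config (sketch, not repo code).
dist_cfg = dict(
    data_parallel_size=1,
    tensor_parallel_size=1,
    pipeline_parallel_size=4,
)
world_size = (
    dist_cfg["data_parallel_size"]
    * dist_cfg["tensor_parallel_size"]
    * dist_cfg["pipeline_parallel_size"]
)
print(f"launch with {world_size} processes (e.g. --nproc_per_node {world_size})")
```
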
35 changes: 20 additions & 15 deletions projects/ChatGLM/dataset.py
@@ -18,6 +18,7 @@

import oneflow as flow
from oneflow.utils.data import Dataset
from tqdm import tqdm

from libai.data.structures import DistTensorData, Instance
from libai.utils import distributed as dist
@@ -28,13 +29,17 @@


class ChatGLMTrainDataset(Dataset):
def __init__(self, path, tokenizer, max_source_len=128, max_target_len=128):
def __init__(self, path, tokenizer, max_source_len=128, max_target_len=128, max_length=None):
with open(path, "r", encoding="utf-8") as f:
self.data = json.load(f)
self.tokenizer = tokenizer
self.max_source_len = max_source_len
self.max_target_len = max_target_len
self.max_len = max_source_len + max_target_len + 1
if max_length is None:
self.max_len = max_source_len + max_target_len + 1
else:
self.max_len = max_length

example = self._preprocess(0)
self.log_dataset_example(example)

@@ -45,25 +50,25 @@ def _preprocess(self, idx):
item = {key: self.data[key][idx] for key in self.data}
# prompt, query, response

source_ids = self.tokenizer.encode(item["prompt"] + item["query"], add_special_tokens=True)[0]
source_ids = self.tokenizer.encode(item["prompt"] + item["query"], add_special_tokens=True)[
0
]
source_ids = source_ids[: self.max_source_len]

target_ids = self.tokenizer.encode(item["response"], add_special_tokens=True)[0]
target_ids = self.tokenizer.encode(item["response"], add_special_tokens=False)[0]
target_ids = target_ids[: self.max_target_len]

input_ids = source_ids + target_ids + [self.tokenizer.eos_token_id]
labels = (
[self.tokenizer.pad_token_id] * len(source_ids)
+ target_ids
+ [self.tokenizer.eos_token_id]
)
input_ids = source_ids + target_ids
labels = [self.tokenizer.pad_token_id] * len(source_ids) + target_ids

input_ids = input_ids[: self.max_len - 1] + [self.tokenizer.eos_token_id]
labels = labels[: self.max_len - 1] + [self.tokenizer.eos_token_id]

# pad
# left pad
pad_len = self.max_len - len(input_ids)
input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
labels = labels + [self.tokenizer.pad_token_id] * pad_len
input_ids = [self.tokenizer.pad_token_id] * pad_len + input_ids
labels = [self.tokenizer.pad_token_id] * pad_len + labels
labels = [(l if l != self.tokenizer.pad_token_id else IGNORE_INDEX) for l in labels]
assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}"

return {"input_ids": input_ids, "labels": labels}

@@ -92,5 +97,5 @@ def __getitem__(self, index):
item = self._preprocess(index)
return Instance(
input_ids=DistTensorData(flow.LongTensor(item["input_ids"])),
labels=DistTensorData(flow.LongTensor(item["labels"])),
labels=DistTensorData(flow.LongTensor(item["labels"]), placement_idx=-1),
)
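
The reworked `_preprocess` now truncates source and target separately, appends a single EOS, pads on the left, and masks pad positions in the labels with `IGNORE_INDEX`. A tokenizer-free sketch of that flow (token ids and pad/EOS values are made up for illustration):

```python
IGNORE_INDEX = -100  # assumed to match the constant imported in dataset.py
PAD_ID, EOS_ID = 0, 2

def build_example(source_ids, target_ids, max_source_len, max_target_len, max_len):
    """Mirror of the left-padding preprocessing added in this commit (sketch)."""
    source_ids = source_ids[:max_source_len]
    target_ids = target_ids[:max_target_len]

    input_ids = source_ids + target_ids
    labels = [PAD_ID] * len(source_ids) + target_ids

    # Truncate, then always end with EOS.
    input_ids = input_ids[: max_len - 1] + [EOS_ID]
    labels = labels[: max_len - 1] + [EOS_ID]

    # Left pad to max_len and drop pads (and the prompt) from the loss.
    pad_len = max_len - len(input_ids)
    input_ids = [PAD_ID] * pad_len + input_ids
    labels = [PAD_ID] * pad_len + labels
    labels = [l if l != PAD_ID else IGNORE_INDEX for l in labels]
    return input_ids, labels

inp, lab = build_example([11, 12, 13], [21, 22], 128, 128, 16)
print(inp)  # left-padded ids ending in EOS
print(lab)  # IGNORE_INDEX over pads and prompt, target ids + EOS kept
```
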
29 changes: 24 additions & 5 deletions projects/ChatGLM/lora/layers.py
@@ -16,20 +16,22 @@

import math
import warnings
from abc import ABC
from typing import Any, List, Optional, Union
from abc import ABC, abstractmethod

import oneflow as flow
import oneflow.nn as nn
import oneflow.nn.functional as F

from libai.layers import Linear as Linear_


def transpose(weight, fan_in_fan_out):
if not fan_in_fan_out:
return weight
return weight.T


class BaseTunerLayer(ABC):
r"""
A tuner layer mixin that provides the common methods and attributes for all tuners.
@@ -185,6 +187,7 @@ def delete_adapter(self, adapter_name: str) -> None:
)
self.set_adapter(remaining_adapters[0])


class LoraLayer(BaseTunerLayer):
# All names of layers that may contain (trainable) adapter weights
adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B")
@@ -229,8 +232,20 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig
self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer}))
# Actual trainable parameters
if r > 0:
self.lora_A[adapter_name] = Linear_(self.in_features, r, bias=False,parallel='col',layer_idx=self.kwargs.get('layer_idx',0))
self.lora_B[adapter_name] = Linear_(r, self.out_features, bias=False,parallel='row',layer_idx=self.kwargs.get('layer_idx',0))
self.lora_A[adapter_name] = Linear_(
self.in_features,
r,
bias=False,
parallel="col",
layer_idx=self.kwargs.get("layer_idx", 0),
)
self.lora_B[adapter_name] = Linear_(
r,
self.out_features,
bias=False,
parallel="row",
layer_idx=self.kwargs.get("layer_idx", 0),
)
self.scaling[adapter_name] = lora_alpha / r

if init_lora_weights:
@@ -279,7 +294,9 @@ def unscale_layer(self, scale=None) -> None:
continue

if scale is None:
self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter]
self.scaling[active_adapter] = (
self.lora_alpha[active_adapter] / self.r[active_adapter]
)
else:
self.scaling[active_adapter] /= scale

@@ -379,7 +396,9 @@ def get_delta_weight(self, adapter) -> flow.Tensor:
weight_A = self.lora_A[adapter].weight
weight_B = self.lora_B[adapter].weight

output_tensor = transpose(flow.matmul(weight_B , weight_A), self.fan_in_fan_out) * self.scaling[adapter]
output_tensor = (
transpose(flow.matmul(weight_B, weight_A), self.fan_in_fan_out) * self.scaling[adapter]
)

return output_tensor
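
The reflowed `get_delta_weight` computes the update that merging an adapter adds to the frozen layer: `transpose(B @ A) * (lora_alpha / r)`. A self-contained sketch of that arithmetic; the `[out_features, in_features]` weight layout and the zero init for `lora_B` are assumptions, since that code sits outside this hunk:

```python
import oneflow as flow

in_features, out_features, r, lora_alpha = 16, 32, 8, 8

weight = flow.randn(out_features, in_features)    # frozen base weight (assumed layout)
weight_A = flow.randn(r, in_features) * 0.01      # lora_A.weight
weight_B = flow.zeros(out_features, r)            # lora_B.weight, typically zero-initialized

scaling = lora_alpha / r
delta = flow.matmul(weight_B, weight_A) * scaling  # fan_in_fan_out=False: no transpose
merged = weight + delta                            # what merging this adapter produces

print(merged.shape)  # [out_features, in_features]
```
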
