-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Explore moe #26
base: main
Are you sure you want to change the base?
Explore moe #26
Changes from 13 commits
45e2338
7b68ff1
d2e5753
348dc90
bc20339
c7239ad
d32c6ec
4b583b6
fb071d5
5c6a0c7
61c931b
74cabfa
807748d
b0a10ab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,82 @@ | ||||||
import torch | ||||||
import transformers | ||||||
from deltazip import AutoDeltaZipModelForCausalLM, BaseCompressionConfig | ||||||
from deltazip.modeling._const import EXPERT_ID_PLACEHOLDER | ||||||
from loguru import logger | ||||||
|
||||||
def to_chatml(prompt):
    """Wrap *prompt* in the ``<human>``/``<assistant>`` single-turn template."""
    template = "<human>: {}<|endoftext|><assistant>:"
    return template.format(prompt)
|
||||||
def to_lmsys(prompt):
    """Wrap *prompt* in the ``User: ... Assistant:`` single-turn template."""
    template = "User: {} Assistant:"
    return template.format(prompt)
|
||||||
def chat(base_model: str, model_path: str):
    """Rebuild a model from compressed MoE artifacts and chat with it on stdin.

    Args:
        base_model: Identifier passed to ``AutoTokenizer.from_pretrained`` to
            load the tokenizer.
        model_path: Directory holding the compressed deltas plus a ``base/``
            sub-directory with the base model and the base expert weights.

    The function loads the base model, the base expert weights and the
    unpacked delta model, writes ``delta + base_expert_weight`` into every
    matching base parameter, moves the result to CUDA and then loops reading
    user input until ``\\exit`` is typed (``\\reset`` clears the history).
    """
    logger.info("Loading tokenizer")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        base_model, trust_remote_code=True
    )
    logger.info("Tokenizer loaded")

    logger.info("Loading base_model")
    # from_pretrained expects a model directory (config + weights), not a
    # single ``.pt`` file; a separate local name avoids shadowing the
    # ``base_model`` string parameter.
    merged_model = transformers.AutoModelForCausalLM.from_pretrained(
        f"{model_path}/base/base_model", trust_remote_code=True
    )
    merged_model = merged_model.half()
    logger.info("Loading base weights")
    # NOTE(review): the compression script visible alongside this one writes
    # ``base_weights.safetensors``; confirm which file format is expected here.
    base_weights = torch.load(f"{model_path}/base/base_weights.pt")

    # Use the function argument, not the module-level ``args`` namespace, so
    # the function also works when imported and called directly.
    delta_model = AutoDeltaZipModelForCausalLM.from_compressed(
        model_path, strict=True, device="cpu", unpack=True, trust_remote_code=True
    )
    delta_model = delta_model.half()

    print("base:")
    print([name for name, param in merged_model.named_parameters()])

    print("delta:")
    print([name for name, param in delta_model.named_parameters()])

    print(f"base_weights: {base_weights.keys()}")

    # Enumerate parameters once; only ``.data`` is rebound below, so the
    # (name, parameter) pairs stay valid for the whole merge.
    base_params = list(merged_model.named_parameters())
    delta_params = list(delta_model.named_parameters())
    for expert_name, expert_weight in base_weights.items():
        # Each key carries a placeholder where the expert index goes; the
        # text on either side of it identifies every expert's copy.
        prefix, suffix = expert_name.split(EXPERT_ID_PLACEHOLDER)
        for name_base, param_base in base_params:
            if not (name_base.startswith(prefix) and name_base.endswith(suffix)):
                continue
            for name_delta, param_delta in delta_params:
                if name_delta.endswith(name_base):
                    param_base.data = param_delta.data + expert_weight

    merged_model.to(torch.device("cuda"))
    print("[deltazip] models loaded")
    pipe = transformers.TextGenerationPipeline(
        model=merged_model, tokenizer=tokenizer, device="cuda"
    )
    dialogs = ""
    while True:
        user_input = input("User: ")
        # Raw strings: "\reset" would otherwise start with a carriage return
        # and never match what the user types.
        if user_input == r"\exit":
            break
        if user_input == r"\reset":
            dialogs = ""
            continue
        model_input = dialogs + to_lmsys(user_input)
        outputs = pipe(
            [model_input],
            max_new_tokens=128,
            do_sample=True,
            temperature=0.6,
            top_k=50,
            top_p=0.9,
            return_full_text=False,
        )[0][0]["generated_text"]
        print(f"Assistant: {outputs}")
        # NOTE(review): only the generated text is appended, so earlier user
        # turns are not part of the history - confirm this is intended.
        dialogs += outputs
|
||||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: both options are plain strings that are
    # forwarded unchanged to chat().
    parser = argparse.ArgumentParser()
    for option, description in (
        ("--base-model", "Type of model"),
        ("--model-path", "Location of model"),
    ):
        parser.add_argument(option, type=str, help=description)
    args = parser.parse_args()
    chat(base_model=args.base_model, model_path=args.model_path)
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,177 @@ | ||||||
import accelerate | ||||||
import os | ||||||
import json | ||||||
import torch | ||||||
import argparse | ||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTNeoXTokenizerFast | ||||||
from deltazip import AutoDeltaZipModelForCausalLM, BaseCompressionConfig, base_generation_strategies, modelling_gpt_neox_moe, modeling_llama_moe | ||||||
from deltazip.modeling._const import EXPERT_ID_PLACEHOLDER | ||||||
from loguru import logger | ||||||
from safetensors.torch import save_file | ||||||
import safetensors | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [nitpick] The Copilot is powered by AI, so mistakes are possible. Review output carefully before use. |
||||||
from transformers import GPTNeoXConfig, LlamaConfig | ||||||
|
||||||
|
||||||
def main(args):
    """Compress an MoE model and write all artifacts under ``args.outdir``.

    Steps, in order:

    1. Build the compression config and load the tokenizer and target model
       (project-specific classes for ``gpt_neox_moe`` / ``llama_moe``, any
       ``from_pretrained`` identifier otherwise).
    2. Read the calibration JSONL (``text`` field of each line), optionally
       shuffle and truncate it, and tokenize it.
    3. Save the base expert weights to ``base/base_weights.safetensors``.
    4. Run lossy compression and save the compressed result to ``outdir``.
    5. Reload the model and save it to ``base/base_model`` with the
       expert-module ``.weight`` tensors removed from its state dict.

    Args:
        args: Parsed command-line namespace (see the argument parser in this
            script for the available fields).
    """
    print(args)
    compress_config = BaseCompressionConfig(
        bits=args.bits,
        sparsity=args.sparsity,
        block_size=args.block_size,
        lossless=args.lossless,
        damp_percent=args.perc_damp,
        sym=True,
        # NOTE(review): the n:m pruning pattern is hard-coded to 2:4, so the
        # --prunen / --prunem options are currently ignored.
        prunen=2,
        prunem=4,
    )
    print("[info] compress config:", compress_config)
    if args.target_model == "gpt_neox_moe":
        tokenizer = GPTNeoXTokenizerFast.from_pretrained(
            args.tokenizer, use_fast=args.fast_tokenizer
        )
        with open(f"{args.model_path}/config.json", "r") as fp:
            config = GPTNeoXConfig(**json.load(fp))
        with accelerate.init_empty_weights():
            model = modelling_gpt_neox_moe.GPTNeoXForCausalLM(config)
        model = model.half()
        model = accelerate.load_checkpoint_and_dispatch(
            model,
            checkpoint=f"{args.model_path}/model.safetensors.index.json",
            device_map="auto",
            no_split_module_classes=["GPTNeoXLayer"],
        )
        model.requires_grad_(False)
        target_model = AutoDeltaZipModelForCausalLM.from_model(
            model, compress_config=compress_config
        )
    elif args.target_model == "llama_moe":
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer, use_fast=args.fast_tokenizer
        )
        with open(f"{args.model_path}/config.json", "r") as fp:
            config = LlamaConfig(**json.load(fp))
        with accelerate.init_empty_weights():
            model = modeling_llama_moe.LlamaForCausalLM(config)
        model = model.half()
        model = accelerate.load_checkpoint_and_dispatch(
            model,
            checkpoint=f"{args.model_path}/model.safetensors.index.json",
            device_map="auto",
            no_split_module_classes=["LlamaDecoderLayer"],
        )
        model.requires_grad_(False)
        target_model = AutoDeltaZipModelForCausalLM.from_model(
            model, compress_config=compress_config
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            args.target_model, use_fast=args.fast_tokenizer
        )
        target_model = AutoDeltaZipModelForCausalLM.from_pretrained(
            args.target_model, compress_config=compress_config, torch_dtype=torch.float16
        )

    target_model.requires_grad_(False)
    torch.cuda.empty_cache()

    # Calibration data: one JSON object per line; blank lines (for example a
    # trailing newline at the end of the file) are skipped instead of
    # crashing json.loads.
    with open(args.dataset, "r") as fp:
        examples = [json.loads(line)["text"] for line in fp if line.strip()]
    if args.n_samples > 0:
        if args.shuffle_dataset:
            import random

            # Fixed seed so the selected calibration subset is reproducible.
            random.seed(42)
            random.shuffle(examples)
        examples = examples[: args.n_samples]
    examples = [tokenizer(x, truncation=True) for x in examples]

    os.makedirs(args.outdir, exist_ok=True)
    os.makedirs(f"{args.outdir}/base", exist_ok=True)

    logger.info("Saving base expert weights:")
    base_weights = target_model.get_moe_base_weights(base_generation_strategies.take_first)
    save_file(base_weights, f"{args.outdir}/base/base_weights.safetensors")
    logger.info("Saving base weights finished")
    # Drop the reference before compression starts.
    del base_weights

    target_model.lossy_compress(
        examples,
        batch_size=1,
        is_moe=True,
    )
    # write to folder
    logger.info("Saving experts' delta weights:")
    target_model.save_compressed(args.outdir)

    # Load the model again so the layers outside the expert modules can be
    # saved separately below.
    if args.target_model == "gpt_neox_moe":
        model = modelling_gpt_neox_moe.GPTNeoXForCausalLM(config)
        model = model.half()
        shard_files = [
            f for f in os.listdir(args.model_path) if f.endswith("safetensors")
        ]
        for f in shard_files:
            print(f"Loading: {args.model_path}/{f}")
            # strict=False: each file is loaded on its own and is not
            # required to contain every tensor of the model.
            safetensors.torch.load_model(model, f"{args.model_path}/{f}", strict=False)
    elif args.target_model == "llama_moe":
        # The tokenizer is not needed past this point, so it is not reloaded.
        with open(f"{args.model_path}/config.json", "r") as fp:
            config = LlamaConfig(**json.load(fp))
        with accelerate.init_empty_weights():
            model = modeling_llama_moe.LlamaForCausalLM(config)
        model = model.half()
        model = accelerate.load_checkpoint_and_dispatch(
            model,
            checkpoint=f"{args.model_path}/model.safetensors.index.json",
            device_map="auto",
            no_split_module_classes=["LlamaDecoderLayer", "LlamaMoE"],
        )
        model.requires_grad_(False)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            args.target_model, torch_dtype=torch.float16, trust_remote_code=True
        )

    logger.info("Saving non-fc layers:")
    sd = model.state_dict()
    # Split every expert module name around the expert-id placeholder once,
    # instead of once per state-dict key.
    expert_patterns = [
        module_name.split(EXPERT_ID_PLACEHOLDER)
        for group in target_model.inside_layer_modules
        for module_name in group
    ]
    to_remove = []
    for name in sd:
        if not (
            name.startswith(target_model.layers_block_name)
            and name.endswith(".weight")
        ):
            continue
        for prefix, suffix in expert_patterns:
            if prefix in name and suffix in name:
                to_remove.append(name)
                # Stop at the first match: a key that matches several
                # patterns must be queued only once, otherwise the second
                # ``del`` below raises KeyError.
                break

    # Make sure we only save the non-fc layers (i.e the layers where MoE isn't applied)
    for name in to_remove:
        del sd[name]
    model.save_pretrained(f"{args.outdir}/base/base_model", state_dict=sd)
    logger.info("Saving base model finished")
|
||||||
if __name__ == "__main__":
    # Command-line entry point for the compression script; every option is
    # collected into one namespace and handed to main().
    parser = argparse.ArgumentParser()

    # Calibration data.
    parser.add_argument(
        "--dataset",
        default="answer_verification",
        type=str,
        help="The dataset to use for training, must be a path to a jsonl file.",
    )
    parser.add_argument(
        "--n-samples",
        default=-1,
        type=int,
        help="How many data samples used for calibration, -1 means all.",
    )

    # Model and tokenizer locations.
    parser.add_argument("--target-model", type=str)
    parser.add_argument("--model-path", type=str)
    parser.add_argument("--tokenizer", type=str)

    # Compression settings.
    parser.add_argument("--sparsity", default=0.5, type=float)
    parser.add_argument("--bits", default=4, type=int)
    parser.add_argument("--block-size", default=128, type=int)
    parser.add_argument("--prunen", default=0, type=int)
    parser.add_argument("--prunem", default=0, type=int)
    parser.add_argument(
        "--lossless", default="gdeflate", type=str, choices=["gdeflate"]
    )
    parser.add_argument("--delta", default="", type=str, choices=["subtract", "xor"])
    parser.add_argument("--perc-damp", default=0.01, type=float)

    # Output location and behaviour switches.
    parser.add_argument("--outdir", default=".cache/compressed_models", type=str)
    parser.add_argument("--fast-tokenizer", action="store_true")
    # store_false: shuffling is on by default and this flag turns it off.
    parser.add_argument("--shuffle-dataset", action="store_false")

    args = parser.parse_args()
    main(args)
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,68 @@ | ||||||
import json | ||||||
import torch | ||||||
import transformers | ||||||
from deltazip import AutoDeltaZipModelForCausalLM, BaseCompressionConfig, modelling_gpt_neox_moe | ||||||
from deltazip.modeling._const import EXPERT_ID_PLACEHOLDER | ||||||
from loguru import logger | ||||||
from safetensors.torch import load_file, load_model | ||||||
|
||||||
def save(model_type, model_path):
    """Merge compressed expert deltas into the base model and save the result.

    Args:
        model_type: ``"gpt-neox-moe"`` to build the project's GPT-NeoX MoE
            class from the saved config; any other value is passed to
            ``from_pretrained`` to load the tokenizer.
        model_path: Directory of the compressed model. It is read for
            ``base/base_model``, ``base/base_weights.safetensors`` and the
            compressed deltas; the merged model is written to
            ``complete_model`` inside it.
    """
    is_gpt_neox_moe = model_type == "gpt-neox-moe"

    logger.info("Loading tokenizer")
    tokenizer = None
    if not is_gpt_neox_moe:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_type, trust_remote_code=True
        )
    logger.info("Tokenizer loaded")
    logger.info("Loading base_model")

    delta_model = None
    config = None
    if is_gpt_neox_moe:
        # Use the ``model_path`` argument rather than the module-level
        # ``args`` namespace so the function also works when imported.
        with open(f"{model_path}/base/base_model/config.json", "r") as fp:
            config = transformers.GPTNeoXConfig(**json.load(fp))
        base_model = modelling_gpt_neox_moe.GPTNeoXForCausalLM(config)
        base_model = base_model.half()
        delta_model = modelling_gpt_neox_moe.GPTNeoXForCausalLM(config)
        delta_model = delta_model.half()
        # strict=False: the saved base model has the expert weights removed.
        load_model(
            base_model, f"{model_path}/base/base_model/model.safetensors", strict=False
        )
    else:
        base_model = transformers.AutoModelForCausalLM.from_pretrained(
            f"{model_path}/base/base_model", trust_remote_code=True
        )

    base_model = base_model.half()
    logger.info("Loading base weights")
    base_weights = load_file(f"{model_path}/base/base_weights.safetensors")

    delta_model = AutoDeltaZipModelForCausalLM.from_compressed(
        model_path,
        strict=True,
        device="cpu",
        unpack=True,
        trust_remote_code=True,
        model_config=config,
        custom_model=delta_model,
    )
    delta_model = delta_model.half()
    logger.info("Loading delta weights")

    # Enumerate parameters once; only ``.data`` is rebound below, so the
    # (name, parameter) pairs stay valid for the whole merge.
    base_params = list(base_model.named_parameters())
    delta_params = list(delta_model.named_parameters())
    for expert_name, expert_weight in base_weights.items():
        # Each key carries a placeholder where the expert index goes; the
        # text on either side of it identifies every expert's copy.
        prefix, suffix = expert_name.split(EXPERT_ID_PLACEHOLDER)
        for name_base, param_base in base_params:
            if not (name_base.startswith(prefix) and name_base.endswith(suffix)):
                continue
            for name_delta, param_delta in delta_params:
                if name_delta.endswith(name_base):
                    print("Merging weights: ", name_base, name_delta)
                    param_base.data = param_delta.data + expert_weight
                    param_base.data = param_base.data.contiguous()

    # After the merge the base model holds the complete weights.
    if tokenizer is not None:
        tokenizer.save_pretrained(f"{model_path}/complete_model")
    logger.info("Saving complete model")
    base_model.save_pretrained(f"{model_path}/complete_model")
|
||||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: both options are plain strings that are
    # forwarded unchanged to save().
    parser = argparse.ArgumentParser()
    for option, description in (
        ("--model-type", "Type of model"),
        ("--model-path", "Directory of compressed model"),
    ):
        parser.add_argument(option, type=str, help=description)
    args = parser.parse_args()
    save(model_type=args.model_type, model_path=args.model_path)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The path should be a directory containing model configurations, not a .pt file.
Copilot is powered by AI, so mistakes are possible. Review output carefully before use.