transfer_weights.py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import random
import numpy as np
import pandas as pd
import transformers
import accelerate
from accelerate import Accelerator, init_empty_weights
from datasets import Dataset
import json
import os
from huggingface_hub import login
from peft import PeftModel
from datetime import datetime
from dotenv import load_dotenv
# Load environment variables (expects HF_TOKEN in a .env file) and log in to the Hugging Face Hub
load_dotenv()
hf_access_token = os.getenv('HF_TOKEN')
login(token=hf_access_token)
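# 4-bit quantization config: weights are stored as NF4 with double quantization
# (the quantization constants are themselves quantized to save a bit more memory),
# while matmuls run in bfloat16.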
bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    # llm_int8_enable_fp32_cpu_offload=True,
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Get date and time (used to stamp the output directory name)
now = datetime.now()
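# Base checkpoint the adapter will be merged into. Rough estimate (not from the
# original script): a 70B model needs on the order of 35 GB of GPU memory for the
# weights alone in 4-bit, and about 140 GB in fp16.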
model_name = "meta-llama/Llama-2-70b-chat-hf"
# Use the current working directory as the base path for model checkpoints
wd = os.getcwd()
# Alternative: load the LoRA model directly (left commented out)
# lora_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     token=hf_access_token,
#     quantization_config=bnb_config,
# )
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# if torch.cuda.device_count() > 1:
#     lora_model = torch.nn.DataParallel(lora_model)
# Load the base model quantized to 4 bits (drop quantization_config if more GPU memory is available)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
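# device_map="auto" lets accelerate spread the layers across the available GPUs
# (and offload to CPU if necessary), so no explicit .to(device) call is needed.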
# Optionally wrap base_model in DataParallel (left commented out)
# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs!")
#     base_model = torch.nn.DataParallel(base_model)
accelerator = Accelerator()
device = accelerator.device
# base_model.to(device)  # Not needed: device_map="auto" already placed the weights
# lora_model.to(device)
# Path to the saved LoRA adapter checkpoint (adjust to your own run's timestamped directory)
model_path = os.path.join(wd, "modelsllama2_70b_2024-04-15_16:44:43.206945")
# lora_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map={"": 0},
# )
# lora_model.load_adapter(model_path)
# Attach the LoRA adapter to the base model and fold its weights into the base layers
merged_model = PeftModel.from_pretrained(base_model, model_path)
merged_model = merged_model.merge_and_unload()
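# Note (assumption, not from the original script): merging LoRA weights into a
# 4-bit-quantized base dequantizes the affected layers on newer peft versions and
# may fail or lose precision on older ones. If memory allows, reloading the base
# in fp16 before merging is a safer option, e.g.:
# base_fp16 = AutoModelForCausalLM.from_pretrained(
#     model_name, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True
# )
# merged_model = PeftModel.from_pretrained(base_fp16, model_path).merge_and_unload()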
# Save the merged model (format the timestamp so the directory name has no spaces or colons)
model_path = f"merged_llama2_70b_prompt_recovery_model_{now:%Y-%m-%d_%H-%M-%S}"
merged_model.save_pretrained(model_path, safe_serialization=True)
# Save the tokenizer alongside the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding so the vocabulary (and the model's embedding matrix) stays unchanged
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained(model_path)
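# Example usage (left commented out; illustrative, not part of the original script):
# reload the merged model and tokenizer from `model_path` for inference.
# reloaded = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )
# reloaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
# inputs = reloaded_tokenizer("Hello", return_tensors="pt").to(reloaded.device)
# print(reloaded_tokenizer.decode(reloaded.generate(**inputs, max_new_tokens=20)[0]))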