-
Notifications
You must be signed in to change notification settings - Fork 1
/
quantize_awq_deepseek.py
60 lines (42 loc) · 2.03 KB
/
quantize_awq_deepseek.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from argparse import ArgumentParser
import logging
# Hugging Face model id of the checkpoint this script loads and quantizes.
MODEL_PATH = "deepseek-ai/deepseek-moe-16b-base"


def _str_to_bool(value):
    """Convert a command-line string such as "True"/"false"/"0" to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including "False") becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw option value (a bool is returned unchanged).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    # Local import: the file header only imports ArgumentParser.
    from argparse import ArgumentTypeError

    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, as it was under the old ``type=bool``.
    if text in {"false", "f", "no", "n", "0", ""}:
        return False
    raise ArgumentTypeError(f"expected a boolean value, got {value!r}")


def _load_model_and_tokenizer(model_path):
    """Load the AWQ model wrapper and the tokenizer for ``model_path``."""
    model = AutoAWQForCausalLM.from_pretrained(
        model_path, low_cpu_mem_usage=True, use_cache=False
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    return model, tokenizer


def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``bits``, ``group_size``, ``is_quantized`` and
        ``quant_path`` (``None`` when the default output directory is wanted).
    """
    parser = ArgumentParser()
    parser.add_argument("--bits", type=int, default=4)
    parser.add_argument("--group_size", type=int, default=64)
    parser.add_argument("--is_quantized", type=_str_to_bool, default=True)
    # Optional override for the output directory; the default keeps the
    # original hard-coded location.
    parser.add_argument("--quant_path", type=str, default=None)
    return parser.parse_args(argv)


def main(argv=None):
    """Quantize the model with AWQ and save it, or just load it.

    When ``--is_quantized`` is true, the model is quantized with the requested
    bit width and group size and written, together with the tokenizer, to the
    output directory. Otherwise the model and tokenizer are only loaded.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The ``(model, tokenizer)`` pair that was loaded.
    """
    args = parse_args(argv)

    if not args.is_quantized:
        model, tokenizer = _load_model_and_tokenizer(MODEL_PATH)
        print(f'Model is loaded from "{MODEL_PATH}"')
        return model, tokenizer

    w_bit = args.bits
    group_size = args.group_size
    quant_path = args.quant_path
    if quant_path is None:
        quant_path = f'/home/LeiFeng/xiaolong/moe_quantize/quantized_deepseek-moe-16b-base-awq-w_bit.{w_bit}-group_size.{group_size}'
    quant_config = {
        "zero_point": True,
        "q_group_size": group_size,
        "w_bit": w_bit,
        "version": "GEMM",
    }

    # Configure the root logger so INFO-and-above records emitted through
    # ``logging`` during the run go to the console and to a per-run log file.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),  # Console handler
            logging.FileHandler(f'quantize_awq_deepseek-moe-16b-base_bits.{w_bit}_group.{group_size}.log'),
        ],
    )

    model, tokenizer = _load_model_and_tokenizer(MODEL_PATH)

    # Quantize
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)
    print(f'Model is quantized and saved at "{quant_path}"')
    return model, tokenizer


if __name__ == "__main__":
    # Bind the results at module level, as the original script did, so they
    # remain available in an interactive session (``python -i``).
    model, tokenizer = main()