ctransformers: another attempt
Generalized ctransformers based on:
#2892
Credits to randoentity
cal066 committed Jul 26, 2023
1 parent 08c622d commit d6f3e91
Showing 6 changed files with 144 additions and 9 deletions.
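Before the per-file diff, it may help to see the underlying ctransformers API that the new wrapper builds on. A minimal standalone sketch, where the GGML path, model_type, and generation settings are hypothetical placeholders and not part of this commit:

```python
# Sketch of the ctransformers API that modules/ctransformers_model.py wraps.
# The GGML file path and settings below are hypothetical examples.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "models/llama-7b.ggmlv3.q4_0.bin",  # hypothetical local GGML checkpoint
    model_type="llama",                 # one of the types listed in model_loader_type_table below
    gpu_layers=0,                       # number of layers to offload to the GPU
    threads=8,
)

# Calling the model with stream=True yields text pieces as they are generated,
# which is what the webui's streaming machinery consumes.
for piece in llm("Once upon a time", max_new_tokens=32, stream=True):
    print(piece, end="", flush=True)
```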
74 changes: 74 additions & 0 deletions modules/ctransformers_model.py
@@ -0,0 +1,74 @@
from ctransformers import AutoModelForCausalLM
from ctransformers import AutoConfig

from modules import shared
from modules.callbacks import Iteratorize
from modules.logging_colors import logger

class CtransformersModel:
    def __init__(self):
        pass

    @classmethod
    def from_pretrained(self, path):
        result = self()

        # ctransformers uses -1 for random seed
        config = AutoConfig.from_pretrained(
            str(path),
            stop=["<|end|>"],
            threads=shared.args.threads,
            gpu_layers=shared.args.n_gpu_layers,
            batch_size=shared.args.n_batch,
            stream=not shared.args.no_stream,
            seed=(-1 if shared.args.llama_cpp_seed == 0 else shared.args.llama_cpp_seed)
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            str(result.model_dir(path) if result.model_type_is_auto() else path),
            model_type=(None if result.model_type_is_auto() else shared.args.model_type),
            config=config
        )
        logger.info(f'Using ctransformers model_type: {self.model.model_type} for {self.model.model_path}')
        # The ctransformers model acts as its own tokenizer, so the same object is returned for both slots.
        return result, result

    def model_type_is_auto(self):
        return shared.args.model_type == "Auto" or shared.args.model_type == "None"

    def model_dir(self, path):
        if path.is_file():
            return path.parent
        return path

    def encode(self, string, **kwargs):
        return self.model.tokenize(string)

    def decode(self, ids):
        return self.model.detokenize(ids)


    def generate(self, prompt, state, callback=None):
        prompt = prompt if type(prompt) is str else prompt.decode()
        generator = self.model._stream(
            prompt=prompt,
            max_new_tokens=state['max_new_tokens'],
            temperature=state['temperature'],
            top_p=state['top_p'],
            top_k=state['top_k'],
            repetition_penalty=state['repetition_penalty'],
            threads=shared.args.threads
        )

        output = ""
        for token in generator:
            if callback:
                callback(token)
            output += token
        return output


    def generate_with_streaming(self, *args, **kwargs):
        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
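A rough sketch of how this wrapper is meant to be exercised; in practice the loader function added to modules/models.py further down does the calling, and shared.args is assumed to already hold the usual CLI defaults (threads, n_gpu_layers, n_batch, llama_cpp_seed). The file path and sampling values here are hypothetical:

```python
# Hypothetical direct use of the new wrapper outside load_model().
from pathlib import Path

from modules import shared
from modules.ctransformers_model import CtransformersModel

shared.args.model_type = "llama"  # skip the "Auto"/"None" directory handling for this sketch
model, tokenizer = CtransformersModel.from_pretrained(
    Path("models/llama-7b.ggmlv3.q4_0.bin"))  # hypothetical GGML file

state = {'max_new_tokens': 64, 'temperature': 0.7, 'top_p': 0.9,
         'top_k': 40, 'repetition_penalty': 1.1}

reply = ""
for reply in model.generate_with_streaming("Hello, my name is", state):
    pass  # each iteration yields the reply accumulated so far
print(reply)
```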
34 changes: 34 additions & 0 deletions modules/loaders.py
@@ -86,9 +86,43 @@
        'compress_pos_emb',
        'alpha_value',
        'exllama_HF_info',
    ],
    'ctransformers': [
        'n_ctx',
        'n_gpu_layers',
        'n_batch',
        'threads',
        'no_mmap',
        'mlock',
        'model_type',
        'llama_cpp_seed',
    ]
}

model_loader_type_table = {
    'GPTQ-for-LLaMa': [
        "None",
        "llama",
        "opt",
        "gptj"
    ],
    'ctransformers': [
        "None",
        "gptj",
        "gpt_neox",
        "llama",
        "mpt",
        "dolly-v2",
        "replit",
        "starcoder",
        "falcon"
    ],
}

def model_loader_type(loader):
    if loader in model_loader_type_table:
        return model_loader_type_table[loader]
    return ["None"]

def get_gpu_memory_keys():
    return [k for k in shared.gradio if k.startswith('gpu_memory')]
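The new model_loader_type() helper feeds the Gradio handler added near the end of this diff, which repopulates the model_type dropdown whenever the loader selection changes. A small illustration, assuming the webui package and its dependencies are importable:

```python
# Illustration only; assumes the webui repo is on the path so that
# modules.loaders imports cleanly.
from modules import loaders

print(loaders.model_loader_type('ctransformers'))
# ['None', 'gptj', 'gpt_neox', 'llama', 'mpt', 'dolly-v2', 'replit', 'starcoder', 'falcon']

print(loaders.model_loader_type('llama.cpp'))
# ['None']  # loaders without a table entry fall back to the default list
```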
22 changes: 21 additions & 1 deletion modules/models.py
@@ -59,7 +59,8 @@ def load_model(model_name, loader=None):
        'FlexGen': flexgen_loader,
        'RWKV': RWKV_loader,
        'ExLlama': ExLlama_loader,
        'ExLlama_HF': ExLlama_HF_loader
        'ExLlama_HF': ExLlama_HF_loader,
        'ctransformers': CtransformersModel_loader,
    }

    p = Path(model_name)
@@ -290,6 +291,25 @@ def llamacpp_HF_loader(model_name):
    return model, tokenizer


def CtransformersModel_loader(model_name):
    from modules.ctransformers_model import CtransformersModel

    path = Path(f'{shared.args.model_dir}/{model_name}')
    logger.info(f'ctransformers loading: {path}\n')
    ctrans = CtransformersModel()
    if ctrans.model_type_is_auto():
        model_file = path
    else:
        if path.is_file():
            model_file = path
        else:
            model_file = list(
                Path(f'{shared.args.model_dir}/{model_name}').glob('*.bin')
            )[0]
    logger.info(f'ctransformers weights detected: {model_file}\n')
    model, tokenizer = ctrans.from_pretrained(model_file)
    return model, tokenizer

def GPTQ_loader(model_name):

    # Monkey patch
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -35,7 +35,7 @@
    'autoload_model': False,
    'max_new_tokens': 200,
    'max_new_tokens_min': 1,
    'max_new_tokens_max': 4096,
    'max_new_tokens_max': 8000,
    'seed': -1,
    'character': 'None',
    'name1': 'You',
16 changes: 11 additions & 5 deletions modules/text_generation.py
@@ -34,9 +34,10 @@ def generate_reply(*args, **kwargs):
def get_max_prompt_length(state):
    return state['truncation_length'] - state['max_new_tokens']


encode_llama_prompts = ['LlamaCppModel', 'RWKVModel', 'CtransformersModel']
encode_llama_truncation = ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel']
def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
    if shared.model.__class__.__name__ in encode_llama_prompts:
        input_ids = shared.tokenizer.encode(str(prompt))
        input_ids = np.array(input_ids).reshape(1, len(input_ids))
        return input_ids
@@ -51,7 +52,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]

    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu:
    if shared.model.__class__.__name__ in encode_llama_truncation or shared.args.cpu:
        return input_ids
    elif shared.args.flexgen:
        return input_ids.numpy()
@@ -171,7 +172,12 @@ def apply_stopping_strings(reply, all_stop_strings):

    return reply, stop_found


_generate_reply_use_custom = [
    'LlamaCppModel',
    'RWKVModel',
    'ExllamaModel',
    'CtransformersModel'
]
def _generate_reply(question, state, stopping_strings=None, is_chat=False):
    generate_func = apply_extensions('custom_generate_reply')
    if generate_func is None:
@@ -180,7 +186,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):
            yield ''
            return

        if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']:
        if shared.model.__class__.__name__ in _generate_reply_use_custom:
            generate_func = generate_reply_custom
        elif shared.args.flexgen:
            generate_func = generate_reply_flexgen
5 changes: 3 additions & 2 deletions server.py
@@ -204,7 +204,7 @@ def create_model_menus():

    with gr.Row():
        with gr.Column():
            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None)
            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
            with gr.Box():
                with gr.Row():
                    with gr.Column():
@@ -223,7 +223,7 @@ def create_model_menus():
                        shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.n_gqa, info='Must be 1e-5 for llama2 70b.')
                        shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
                        shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")
                        shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None")
                        shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
                        shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                        shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
                        shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -265,6 +265,7 @@ def create_model_menus():
            shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')

    shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params()))
    shared.gradio['loader'].change(fn=lambda value: gr.update(choices=loaders.model_loader_type(value)), inputs=shared.gradio['loader'], outputs=shared.gradio['model_type'])

    # In this event handler, the interface state is read and updated
    # with the model defaults (if any), and then the model is loaded
