diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 22159ae..2b7f490 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,6 +14,7 @@ repos:
     hooks:
       - id: isort
         name: Format imports
+        args: ["--profile", "black", "--filter-files"]
   - repo: https://github.com/PyCQA/flake8
     rev: 6.1.0
     hooks:
diff --git a/README.md b/README.md
index e4cfb98..258fcdf 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,11 @@
 Invoker is a suite of large language models based on Llama-2 and is finetuned to plan between calling functions and providing responses directly. Currently, we have released the 13B version and there are plans for the 7B and 34B versions to be trained and released in the future.
 
+## News
+
+- [2023/09] We released **Invoker-13B-GPTQ**, a 4-bit GPTQ quantization of Invoker-13B. Download the [weights](https://huggingface.co/jeffrey-fong/invoker-13b-GPTQ). We also added ExLlamaV2 integration!
+- [2023/09] We released **Invoker-13B**, a model trained on function-calling and multi-turn conversation datasets. Download the [weights](https://huggingface.co/jeffrey-fong/invoker-13b).
+
 ## Installation & Usage
 
 The usage of Invoker follows exactly like OpenAI's function calling. Simply install the required dependencies:
@@ -20,20 +25,29 @@ pip install -r requirements.txt
 ```
 
 #### Launching the Server
 
-Kick-start the FastAPI server. You can indicate the model via environment variables. The list of models are indicated [here](#download).
+Kick-start the FastAPI server. You can indicate the model details via environment variables. The Invoker server currently supports two different ways of loading the model. If you would like to load the full fp16 model using HuggingFace Transformers, run the following commands:
 
 ```shell
+export INVOKER_MODEL_TYPE=hf
 export INVOKER_MODEL_NAME_OR_PATH=jeffrey-fong/invoker-13b
 uvicorn server_fastapi:app
 ```
 
-There are plans to set up accelerated servers based on [ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2) that can work with GPTQ-based models. Stay tuned for more updates!
+If you would like to load the 4-bit quantized Invoker GPTQ models using [ExLlamaV2](https://github.com/turboderp/exllamav2), run the following commands:
+
+```shell
+export INVOKER_MODEL_TYPE=exllamav2
+export INVOKER_MODEL_NAME_OR_PATH=jeffrey-fong/invoker-13b-GPTQ
+uvicorn server_fastapi:app
+```
+
+The full list of models is available [here](#download).
 
 #### Inference
 
 Inference can then be performed exactly like OpenAI function-calling. Provide the chat and the functions in the `messages` and `functions` arguments respectively. Invoker also supports the following generation hyperparameters:
 
-- `temperature: float = 0.7` Accepts values between 0.0 and 1.0. Defaults to 0.7 if the temperature is not passed in.
+- `temperature: float = 0.5` Accepts values between 0.0 and 1.0. Defaults to 0.5 if the temperature is not passed in.
 - `top_p: float = 1.0` Accepts values between 0.0 and 1.0. Defaults to 1.0 if the top_p is not passed in.
 ```python
@@ -108,7 +122,7 @@ Please refer to the model card in HuggingFace to see how to use the model direct
 | Model | Link | Version |
 | ------------- | ------------- |------------- |
 | Invoker-13B | [Huggingface Repo](https://huggingface.co/jeffrey-fong/invoker-13b) |v1.0|
-| Invoker-13B-GPTQ | Coming Soon |v1.0|
+| Invoker-13B-GPTQ | [Huggingface Repo](https://huggingface.co/jeffrey-fong/invoker-13b-GPTQ) |v1.0|
 | Invoker-7B | Coming Soon |v1.0|
 | Invoker-34B | Coming Soon |v1.0|
@@ -153,9 +167,9 @@ All the datasets used are under Apache-2.0 License. Therefore, this dataset will
 
 ## To-Dos
 
+- [X] Quantize 13B model
+- [X] Work on GPTQ-based servers ([ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2))
 - [ ] Work on validating function names, descriptions, etc. Just like OpenAI's function calling
-- [ ] Quantize 13B model
-- [ ] Work on GPTQ-based servers ([ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2))
 - [ ] Converting Invoker to other formats like:
   - [ ] GGUF
   - [ ] AWQ
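The README's own Python inference example is elided from the hunk above. For reference, the snippet below is a minimal sketch of the OpenAI-style call the server accepts, assuming the server runs on uvicorn's default port 8000, that no API key is checked, and that plain `role`/`content` message dicts and an OpenAI-style function schema validate against the `Message` and `Function` models in `invoker/api_types.py`. It uses the `openai==0.28.0` client pinned in `requirements.txt`.

```python
import openai

openai.api_key = "not-needed"              # assumption: the local server does not check API keys
openai.api_base = "http://localhost:8000"  # assumption: uvicorn's default host/port

# Hypothetical function definition, following OpenAI's function-calling schema.
functions = [
    {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "The city, e.g. Singapore"}
            },
            "required": ["location"],
        },
    }
]

response = openai.ChatCompletion.create(
    model="jeffrey-fong/invoker-13b",
    messages=[{"role": "user", "content": "What is the weather like in Singapore?"}],
    functions=functions,
    temperature=0.5,
    top_p=1.0,
)
# Assumes the server's ChatOutput mirrors OpenAI's "choices" structure.
print(response["choices"][0])
```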
diff --git a/invoker/api_types.py b/invoker/api_types.py
index fb524b3..35e53f0 100644
--- a/invoker/api_types.py
+++ b/invoker/api_types.py
@@ -31,7 +31,7 @@ class ChatInput(BaseModel):
     model: str
     messages: List[Message]
     functions: Optional[List[Function]] = None
-    temperature: float = 0.7
+    temperature: float = 0.5
     top_p: float = 1.0
 
 
diff --git a/invoker/model.py b/invoker/model.py
index d02c285..ac84ecd 100644
--- a/invoker/model.py
+++ b/invoker/model.py
@@ -5,19 +5,36 @@
 from typing import Any, Dict, List, Optional
 
 import torch
+from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer
+from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
 from transformers import LlamaForCausalLM, LlamaTokenizer
 
 from invoker.api_types import Function, Message
+from invoker.utils.enum_tags import ModelType
 
 
 class InvokerPipeline:
     # Singleton instance
     _pipeline = None
 
-    def __init__(self, model_path: str):
-        self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
+    def __init__(self, model_path: str, model_type: ModelType):
         # Load model
-        self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
+        self._model_type = model_type
+        if model_type == ModelType.exllamav2:
+            config = ExLlamaV2Config()
+            config.model_dir = model_path
+            config.prepare()
+            model = ExLlamaV2(config)
+            model.load()
+            self._tokenizer = ExLlamaV2Tokenizer(config)
+            cache = ExLlamaV2Cache(model)
+            self._generator = ExLlamaV2BaseGenerator(model, cache, self._tokenizer)
+            self._generator.warmup()
+            self._settings = ExLlamaV2Sampler.Settings()
+            self._settings.token_repetition_penalty = 1.0
+        else:
+            self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
+            self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
 
     def format_message(self, messages: List[Message], functions: Optional[List[Function]]):
         prompt = "Available Functions:"
@@ -55,19 +72,21 @@ def format_message(self, messages: List[Message], functions: Optional[List[Funct
         return prompt
 
     def generate(self, input_text: str, params: Dict[str, Any]) -> str:
-        # Tokenize the input
-        input_ids = self._tokenizer(input_text, return_tensors="pt").input_ids.cuda()
-        # Run the model to infer an output
         temperature, top_p = params.get("temperature"), params.get("top_p")
-        do_sample = True if temperature > 0.0 else False
-        output_ids = self._model.generate(
-            input_ids=input_ids,
-            max_new_tokens=512,
-            do_sample=do_sample,
-            top_p=top_p,
-            temperature=temperature,
-        )
-        raw_output = self._tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        if self._model_type == ModelType.exllamav2:
+            self._settings.temperature, self._settings.top_p = temperature, top_p
+            raw_output = self._generator.generate_simple(input_text, self._settings, num_tokens=512)
+        else:
+            input_ids = self._tokenizer(input_text, return_tensors="pt").input_ids.cuda()
+            do_sample = True if temperature > 0.0 else False
+            output_ids = self._model.generate(
+                input_ids=input_ids,
+                max_new_tokens=512,
+                do_sample=do_sample,
+                top_p=top_p,
+                temperature=temperature,
+            )
+            raw_output = self._tokenizer.decode(output_ids[0], skip_special_tokens=True)
         output = raw_output[len(input_text) :]
         choices = self._postprocess(text=output)
         return choices
@@ -100,9 +119,9 @@ def _postprocess(self, text):
         return choices
 
     @classmethod
-    async def maybe_init(cls, model_path: str) -> InvokerPipeline:
+    async def maybe_init(cls, model_path: str, model_type: ModelType) -> InvokerPipeline:
         if cls._pipeline is None:
-            cls._pipeline = InvokerPipeline(model_path=model_path)
+            cls._pipeline = InvokerPipeline(model_path=model_path, model_type=model_type)
         if cls._pipeline is not None:
             return cls._pipeline
         else:
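As a rough illustration of how the new `model_type` plumbing is meant to be driven (this is not code from the PR): the sketch below loads the pipeline directly through `InvokerPipeline.maybe_init` and calls `generate`. The model path and the placeholder prompt are assumptions, and the GPTQ weights are assumed to be available locally; in normal use the FastAPI server builds the prompt via `format_message` from `Message`/`Function` objects instead.

```python
# Hypothetical smoke test for the ExLlamaV2 code path; the class and method names
# come from this PR, but the model path and prompt below are placeholders.
import asyncio

from invoker.model import InvokerPipeline
from invoker.utils.enum_tags import ModelType


async def main() -> None:
    # maybe_init caches a singleton, so repeated calls reuse the already-loaded model.
    pipeline = await InvokerPipeline.maybe_init(
        model_path="path/to/invoker-13b-GPTQ",  # assumption: local directory holding the GPTQ weights
        model_type=ModelType.exllamav2,
    )
    # The server would normally build this prompt with pipeline.format_message(...);
    # a plain string keeps the sketch self-contained.
    choices = pipeline.generate(
        input_text="Available Functions:...",  # placeholder prompt
        params={"temperature": 0.5, "top_p": 1.0},
    )
    print(choices)


asyncio.run(main())
```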
diff --git a/invoker/utils/enum_tags.py b/invoker/utils/enum_tags.py
new file mode 100644
index 0000000..e7f7535
--- /dev/null
+++ b/invoker/utils/enum_tags.py
@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class ModelType(str, Enum):
+    hf = "hf"
+    exllamav2 = "exllamav2"
diff --git a/requirements.txt b/requirements.txt
index d534742..ecc2adb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,5 @@ accelerate==0.22.0
 fastapi==0.103.0
 uvicorn==0.23.2
 pydantic-settings==2.0.3
-openai==0.28.0
\ No newline at end of file
+openai==0.28.0
+exllamav2==0.0.3
\ No newline at end of file
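The `ModelType` enum above is consumed by the `Settings` class in the `server_fastapi.py` diff below via `pydantic-settings`. The following is a standalone sketch of that mechanism, assuming it is run from the repository root so `invoker` is importable; the `Settings` class here simply mirrors the one in the PR.

```python
# Standalone illustration (not part of the PR) of how the new environment variables
# are resolved: pydantic-settings reads INVOKER_MODEL_TYPE / INVOKER_MODEL_NAME_OR_PATH
# when Settings() is constructed, and the str-valued ModelType enum validates the value.
import os

from pydantic import Field
from pydantic_settings import BaseSettings

from invoker.utils.enum_tags import ModelType


class Settings(BaseSettings):
    # Mirrors the Settings class defined in server_fastapi.py.
    invoker_model_type: ModelType = Field("hf", env="INVOKER_MODEL_TYPE")
    invoker_model_name_or_path: str = Field("jeffrey-fong/invoker-13b", env="INVOKER_MODEL_NAME_OR_PATH")


os.environ["INVOKER_MODEL_TYPE"] = "exllamav2"
os.environ["INVOKER_MODEL_NAME_OR_PATH"] = "jeffrey-fong/invoker-13b-GPTQ"

settings = Settings()
print(settings.invoker_model_type)          # ModelType.exllamav2
print(settings.invoker_model_name_or_path)  # jeffrey-fong/invoker-13b-GPTQ
```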
diff --git a/server_fastapi.py b/server_fastapi.py
index f2eca04..6068561 100644
--- a/server_fastapi.py
+++ b/server_fastapi.py
@@ -7,16 +7,18 @@
 
 from invoker.api_types import ChatInput, ChatOutput
 from invoker.model import InvokerPipeline
-
-
-async def get_pipeline(model_path: str):
-    return await InvokerPipeline.maybe_init(model_path=model_path)
+from invoker.utils.enum_tags import ModelType
 
 
 class Settings(BaseSettings):
+    invoker_model_type: ModelType = Field("hf", env="INVOKER_MODEL_TYPE")
     invoker_model_name_or_path: str = Field("jeffrey-fong/invoker-13b", env="INVOKER_MODEL_NAME_OR_PATH")
 
 
+async def get_pipeline(model_path: str, model_type: ModelType):
+    return await InvokerPipeline.maybe_init(model_path=model_path, model_type=model_type)
+
+
 app = FastAPI(title="Invoker")
 settings = Settings()
 
@@ -24,7 +26,9 @@ class Settings(BaseSettings):
 @app.post("/chat/completions", response_model=ChatOutput)
 async def chat(req: ChatInput):
     id = str(uuid.uuid4())
-    invoker_pipeline: InvokerPipeline = await get_pipeline(model_path=settings.invoker_model_name_or_path)
+    invoker_pipeline: InvokerPipeline = await get_pipeline(
+        model_path=settings.invoker_model_name_or_path, model_type=settings.invoker_model_type
+    )
     prompt = invoker_pipeline.format_message(messages=req.messages, functions=req.functions)
     choices = invoker_pipeline.generate(input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p})
     created = int(time.time())
@@ -33,4 +37,4 @@
 
 @app.on_event("startup")
 async def startup():
-    _ = await get_pipeline(model_path=settings.invoker_model_name_or_path)
+    _ = await get_pipeline(model_path=settings.invoker_model_name_or_path, model_type=settings.invoker_model_type)
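Finally, a raw-HTTP view of the `/chat/completions` route touched above, useful for checking the `ChatInput` payload (the explicit `temperature` mirrors the new 0.5 default) without the OpenAI client. Only the standard library is used; the message and function dicts are illustrative assumptions, since the exact `Message`/`Function` schemas live in `invoker/api_types.py` and are not shown in this diff.

```python
# Minimal raw request sketch against a locally running server (assumed at localhost:8000).
import json
import urllib.request

payload = {
    "model": "jeffrey-fong/invoker-13b-GPTQ",
    "messages": [{"role": "user", "content": "What's the weather in Singapore?"}],
    "functions": [
        {
            "name": "get_current_weather",  # hypothetical function for illustration
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        }
    ],
    "temperature": 0.5,
    "top_p": 1.0,
}

request = urllib.request.Request(
    "http://localhost:8000/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as response:
    # The response is expected to follow the ChatOutput model returned by the server.
    print(json.loads(response.read()))
```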