Add Exllamav2 Integration #1

Merged: 3 commits, Sep 27, 2023

Changes from all commits
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -14,6 +14,7 @@ repos:
hooks:
- id: isort
name: Format imports
args: ["--profile", "black", "--filter-files"]
- repo: https://github.com/PyCQA/flake8
rev: 6.1.0
hooks:
26 changes: 20 additions & 6 deletions README.md
@@ -10,6 +10,11 @@

Invoker is a suite of large language models based on Llama-2 and is finetuned to plan between calling functions and providing responses directly. Currently, we have released the 13B version and there are plans for the 7B and 34B versions to be trained and released in the future.

## News

- [2023/09] We released **Invoker-13B-GPTQ**, a 4-bit GPTQ-quantized version of Invoker-13B. Download the [weights](https://huggingface.co/jeffrey-fong/invoker-13b-GPTQ). We have also added ExLlamaV2 integration!
- [2023/09] We released **Invoker-13B**, a model trained on function-calling and multi-turn conversation datasets. Download the [weights](https://huggingface.co/jeffrey-fong/invoker-13b).

## Installation & Usage

Invoker is used exactly like OpenAI's function calling. Simply install the required dependencies:
@@ -20,20 +20,29 @@ pip install -r requirements.txt

#### Launching the Server

Kick-start the FastAPI server. You can indicate the model via environment variables. The list of models are indicated [here](#download).
Kick-start the FastAPI server. You can specify the model details via environment variables. The Invoker server currently supports two different ways of loading the model. If you would like to load the full fp16 model using HuggingFace Transformers, run the following commands:

```shell
export INVOKER_MODEL_TYPE=hf
export INVOKER_MODEL_NAME_OR_PATH=jeffrey-fong/invoker-13b
uvicorn server_fastapi:app
```

There are plans to set up accelerated servers based on [ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2) that can work with GPTQ-based models. Stay tuned for more updates!
If you would like to load the 4-bit quantized Invoker GPTQ models using [ExLlamaV2](https://github.com/turboderp/exllamav2), run the following commands:

```shell
export INVOKER_MODEL_TYPE=exllamav2
export INVOKER_MODEL_NAME_OR_PATH=jeffrey-fong/invoker-13b-GPTQ
uvicorn server_fastapi:app
```

The full list of models is available [here](#download).

#### Inference

Inference can then be performed exactly like OpenAI function-calling. Provide the chat and the functions in the `messages` and `functions` arguments respectively. Invoker also supports the following generation hyperparameters:

- `temperature: float = 0.7` Accepts values between 0.0 and 1.0. Defaults to 0.7 if the temperature is not passed in.
- `temperature: float = 0.5` Accepts values between 0.0 and 1.0. Defaults to 0.5 if the temperature is not passed in.
- `top_p: float = 1.0` Accepts values between 0.0 and 1.0. Defaults to 1.0 if the top_p is not passed in.

```python
@@ -108,7 +122,7 @@ Please refer to the model card in HuggingFace to see how to use the model direct
| Model | Link | Version |
| ------------- | ------------- |------------- |
| Invoker-13B | [Huggingface Repo](https://huggingface.co/jeffrey-fong/invoker-13b) |v1.0|
| Invoker-13B-GPTQ | Coming Soon |v1.0|
| Invoker-13B-GPTQ | [Huggingface Repo](https://huggingface.co/jeffrey-fong/invoker-13b-GPTQ) |v1.0|
| Invoker-7B | Coming Soon |v1.0|
| Invoker-34B | Coming Soon |v1.0|

@@ -153,9 +167,9 @@ All the datasets used are under Apache-2.0 License. Therefore, this dataset will

## To-Dos

- [X] Quantize 13B model
- [X] Work on GPTQ-based servers ([ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2))
- [ ] Work on validating function names, descriptions, etc., just like OpenAI's function calling
- [ ] Quantize 13B model
- [ ] Work on GPTQ-based servers ([ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2))
- [ ] Convert Invoker to other formats such as:
- [ ] GGUF
- [ ] AWQ
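For reference, a client call in the OpenAI style described in the README might look like the sketch below. It is a minimal illustration only: the weather function, the `localhost:8000` address, and the response indexing are assumptions, not part of this PR.

```python
import openai

# Point the OpenAI client (openai==0.28.0, as pinned in requirements.txt) at the
# local Invoker server started with `uvicorn server_fastapi:app`.
openai.api_base = "http://localhost:8000"  # assumed default uvicorn host/port
openai.api_key = "not-needed"              # the local server does not check API keys

# Hypothetical function definition in OpenAI's function-calling schema.
functions = [
    {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name, e.g. Singapore"}
            },
            "required": ["location"],
        },
    }
]

response = openai.ChatCompletion.create(
    model="jeffrey-fong/invoker-13b",
    messages=[{"role": "user", "content": "What is the weather like in Singapore?"}],
    functions=functions,
    temperature=0.5,
    top_p=1.0,
)
print(response["choices"][0])
```
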
2 changes: 1 addition & 1 deletion invoker/api_types.py
@@ -31,7 +31,7 @@ class ChatInput(BaseModel):
model: str
messages: List[Message]
functions: Optional[List[Function]] = None
temperature: float = 0.7
temperature: float = 0.5
top_p: float = 1.0


53 changes: 36 additions & 17 deletions invoker/model.py
@@ -5,19 +5,36 @@
from typing import Any, Dict, List, Optional

import torch
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
from transformers import LlamaForCausalLM, LlamaTokenizer

from invoker.api_types import Function, Message
from invoker.utils.enum_tags import ModelType


class InvokerPipeline:
# Singleton instance
_pipeline = None

def __init__(self, model_path: str):
self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
def __init__(self, model_path: str, model_type: ModelType):
# Load model
self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
self._model_type = model_type
if model_type == ModelType.exllamav2:
config = ExLlamaV2Config()
config.model_dir = model_path
config.prepare()
model = ExLlamaV2(config)
model.load()
self._tokenizer = ExLlamaV2Tokenizer(config)
cache = ExLlamaV2Cache(model)
self._generator = ExLlamaV2BaseGenerator(model, cache, self._tokenizer)
self._generator.warmup()
self._settings = ExLlamaV2Sampler.Settings()
self._settings.token_repetition_penalty = 1.0
else:
self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False)
self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

def format_message(self, messages: List[Message], functions: Optional[List[Function]]):
prompt = "Available Functions:"
@@ -55,19 +72,21 @@ def format_message(self, messages: List[Message], functions: Optional[List[Funct
return prompt

def generate(self, input_text: str, params: Dict[str, Any]) -> str:
# Tokenize the input
input_ids = self._tokenizer(input_text, return_tensors="pt").input_ids.cuda()
# Run the model to infer an output
temperature, top_p = params.get("temperature"), params.get("top_p")
do_sample = True if temperature > 0.0 else False
output_ids = self._model.generate(
input_ids=input_ids,
max_new_tokens=512,
do_sample=do_sample,
top_p=top_p,
temperature=temperature,
)
raw_output = self._tokenizer.decode(output_ids[0], skip_special_tokens=True)
if self._model_type == ModelType.exllamav2:
self._settings.temperature, self._settings.top_p = temperature, top_p
raw_output = self._generator.generate_simple(input_text, self._settings, num_tokens=512)
else:
input_ids = self._tokenizer(input_text, return_tensors="pt").input_ids.cuda()
do_sample = True if temperature > 0.0 else False
output_ids = self._model.generate(
input_ids=input_ids,
max_new_tokens=512,
do_sample=do_sample,
top_p=top_p,
temperature=temperature,
)
raw_output = self._tokenizer.decode(output_ids[0], skip_special_tokens=True)
output = raw_output[len(input_text) :]
choices = self._postprocess(text=output)
return choices
@@ -100,9 +119,9 @@ def _postprocess(self, text):
return choices

@classmethod
async def maybe_init(cls, model_path: str) -> InvokerPipeline:
async def maybe_init(cls, model_path: str, model_type: ModelType) -> InvokerPipeline:
if cls._pipeline is None:
cls._pipeline = InvokerPipeline(model_path=model_path)
cls._pipeline = InvokerPipeline(model_path=model_path, model_type=model_type)
if cls._pipeline is not None:
return cls._pipeline
else:
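The ExLlamaV2 calls introduced in `InvokerPipeline.__init__` and `generate` can also be exercised on their own. Below is a minimal standalone sketch assuming the GPTQ weights have already been downloaded to a local directory; the directory path, prompt, and token budget are placeholders.

```python
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler

# Placeholder path to a local copy of jeffrey-fong/invoker-13b-GPTQ.
model_dir = "./invoker-13b-GPTQ"

# Load the quantized model and its tokenizer, mirroring InvokerPipeline.__init__.
config = ExLlamaV2Config()
config.model_dir = model_dir
config.prepare()
model = ExLlamaV2(config)
model.load()
tokenizer = ExLlamaV2Tokenizer(config)

# Build a generator backed by a KV cache and warm it up.
cache = ExLlamaV2Cache(model)
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
generator.warmup()

# Sampling settings matching the values used by the pipeline.
settings = ExLlamaV2Sampler.Settings()
settings.token_repetition_penalty = 1.0
settings.temperature = 0.5
settings.top_p = 1.0

# generate_simple returns the prompt plus the completion as one string.
print(generator.generate_simple("Hello, my name is", settings, num_tokens=64))
```
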
6 changes: 6 additions & 0 deletions invoker/utils/enum_tags.py
@@ -0,0 +1,6 @@
from enum import Enum


class ModelType(str, Enum):
hf = "hf"
exllamav2 = "exllamav2"
3 changes: 2 additions & 1 deletion requirements.txt
@@ -6,4 +6,5 @@ accelerate==0.22.0
fastapi==0.103.0
uvicorn==0.23.2
pydantic-settings==2.0.3
openai==0.28.0
openai==0.28.0
exllamav2==0.0.3
16 changes: 10 additions & 6 deletions server_fastapi.py
@@ -7,24 +7,28 @@

from invoker.api_types import ChatInput, ChatOutput
from invoker.model import InvokerPipeline


async def get_pipeline(model_path: str):
return await InvokerPipeline.maybe_init(model_path=model_path)
from invoker.utils.enum_tags import ModelType


class Settings(BaseSettings):
invoker_model_type: ModelType = Field("hf", env="INVOKER_MODEL_TYPE")
invoker_model_name_or_path: str = Field("jeffrey-fong/invoker-13b", env="INVOKER_MODEL_NAME_OR_PATH")


async def get_pipeline(model_path: str, model_type: ModelType):
return await InvokerPipeline.maybe_init(model_path=model_path, model_type=model_type)


app = FastAPI(title="Invoker")
settings = Settings()


@app.post("/chat/completions", response_model=ChatOutput)
async def chat(req: ChatInput):
id = str(uuid.uuid4())
invoker_pipeline: InvokerPipeline = await get_pipeline(model_path=settings.invoker_model_name_or_path)
invoker_pipeline: InvokerPipeline = await get_pipeline(
model_path=settings.invoker_model_name_or_path, model_type=settings.invoker_model_type
)
prompt = invoker_pipeline.format_message(messages=req.messages, functions=req.functions)
choices = invoker_pipeline.generate(input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p})
created = int(time.time())
@@ -33,4 +37,4 @@ async def chat(req: ChatInput):

@app.on_event("startup")
async def startup():
_ = await get_pipeline(model_path=settings.invoker_model_name_or_path)
_ = await get_pipeline(model_path=settings.invoker_model_name_or_path, model_type=settings.invoker_model_type)
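
With this change, both the model path and the model type are read from the environment at startup and threaded into `InvokerPipeline.maybe_init`. The sketch below shows how the environment variables resolve; the `Settings` class is re-declared so the sketch does not import the model-loading code, and it assumes pydantic-settings' default case-insensitive matching of field names to environment variables.

```python
import os

from pydantic import Field
from pydantic_settings import BaseSettings

from invoker.utils.enum_tags import ModelType


# Re-declaration of the Settings class added in server_fastapi.py (illustration only).
class Settings(BaseSettings):
    invoker_model_type: ModelType = Field("hf", env="INVOKER_MODEL_TYPE")
    invoker_model_name_or_path: str = Field("jeffrey-fong/invoker-13b", env="INVOKER_MODEL_NAME_OR_PATH")


os.environ["INVOKER_MODEL_TYPE"] = "exllamav2"
os.environ["INVOKER_MODEL_NAME_OR_PATH"] = "jeffrey-fong/invoker-13b-GPTQ"

settings = Settings()
assert settings.invoker_model_type is ModelType.exllamav2  # env string coerced into the enum
print(settings.invoker_model_name_or_path)                 # jeffrey-fong/invoker-13b-GPTQ
```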