From 923a5b8d5f9d953bc2d028945b5f93b6be496c3e Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Wed, 27 Sep 2023 08:37:12 +0000 Subject: [PATCH 1/3] add exllamav2 integration --- .pre-commit-config.yaml | 1 + invoker/model.py | 31 +++++++++++++++++++++++++++++-- requirements.txt | 3 ++- server_fastapi.py | 7 ++++++- 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22159ae..2b7f490 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,7 @@ repos: hooks: - id: isort name: Format imports + args: ["--profile", "black", "--filter-files"] - repo: https://github.com/PyCQA/flake8 rev: 6.1.0 hooks: diff --git a/invoker/model.py b/invoker/model.py index d02c285..fda6ae5 100644 --- a/invoker/model.py +++ b/invoker/model.py @@ -5,6 +5,8 @@ from typing import Any, Dict, List, Optional import torch +from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer +from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler from transformers import LlamaForCausalLM, LlamaTokenizer from invoker.api_types import Function, Message @@ -15,9 +17,22 @@ class InvokerPipeline: _pipeline = None def __init__(self, model_path: str): - self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False) # Load model - self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") + if "GPTQ" in model_path: + config = ExLlamaV2Config() + config.model_dir = model_path + config.prepare() + + model = ExLlamaV2(config) + model.load() + self._tokenizer = ExLlamaV2Tokenizer(config) + + cache = ExLlamaV2Cache(model) + self.generator = ExLlamaV2BaseGenerator(model, cache, self._tokenizer) + self.generator.warmup() + else: + self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False) + self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") def format_message(self, messages: List[Message], functions: Optional[List[Function]]): prompt = "Available Functions:" @@ -72,6 +87,18 @@ def generate(self, input_text: str, params: Dict[str, Any]) -> str: choices = self._postprocess(text=output) return choices + def generate_exllama(self, input_text: str, params: Dict[str, Any]) -> str: + temperature, top_p = params.get("temperature"), params.get("top_p") + settings = ExLlamaV2Sampler.Settings() + settings.token_repetition_penalty = 1.0 + settings.temperature = temperature + settings.top_p = top_p + raw_output = self.generator.generate_simple(input_text, settings, num_tokens=512) + breakpoint() + output = raw_output[len(input_text) :] + choices = self._postprocess(text=output) + return choices + def _postprocess(self, text): output_json = json.loads(re.search(r"```(.*?)```?", text, re.DOTALL).group(1)) if output_json["function_call"] is not None: diff --git a/requirements.txt b/requirements.txt index d534742..ecc2adb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ accelerate==0.22.0 fastapi==0.103.0 uvicorn==0.23.2 pydantic-settings==2.0.3 -openai==0.28.0 \ No newline at end of file +openai==0.28.0 +exllamav2==0.0.3 \ No newline at end of file diff --git a/server_fastapi.py b/server_fastapi.py index f2eca04..f763afb 100644 --- a/server_fastapi.py +++ b/server_fastapi.py @@ -26,7 +26,12 @@ async def chat(req: ChatInput): id = str(uuid.uuid4()) invoker_pipeline: InvokerPipeline = await get_pipeline(model_path=settings.invoker_model_name_or_path) prompt = 
invoker_pipeline.format_message(messages=req.messages, functions=req.functions) - choices = invoker_pipeline.generate(input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p}) + # choices = invoker_pipeline.generate( + # input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p} + # ) + choices = invoker_pipeline.generate_exllama( + input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p} + ) created = int(time.time()) return {"id": id, "created": created, "choices": choices} From 0cb801d65ec52cc7e3b8fbe4b896a93b01845597 Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Wed, 27 Sep 2023 15:01:51 +0000 Subject: [PATCH 2/3] refactor exllamav2 integration --- invoker/api_types.py | 2 +- invoker/model.py | 56 ++++++++++++++++---------------------- invoker/utils/enum_tags.py | 6 ++++ server_fastapi.py | 23 ++++++++-------- 4 files changed, 42 insertions(+), 45 deletions(-) create mode 100644 invoker/utils/enum_tags.py diff --git a/invoker/api_types.py b/invoker/api_types.py index fb524b3..35e53f0 100644 --- a/invoker/api_types.py +++ b/invoker/api_types.py @@ -31,7 +31,7 @@ class ChatInput(BaseModel): model: str messages: List[Message] functions: Optional[List[Function]] = None - temperature: float = 0.7 + temperature: float = 0.5 top_p: float = 1.0 diff --git a/invoker/model.py b/invoker/model.py index fda6ae5..ac84ecd 100644 --- a/invoker/model.py +++ b/invoker/model.py @@ -10,26 +10,28 @@ from transformers import LlamaForCausalLM, LlamaTokenizer from invoker.api_types import Function, Message +from invoker.utils.enum_tags import ModelType class InvokerPipeline: # Singleton instance _pipeline = None - def __init__(self, model_path: str): + def __init__(self, model_path: str, model_type: ModelType): # Load model - if "GPTQ" in model_path: + self._model_type = model_type + if model_type == ModelType.exllamav2: config = ExLlamaV2Config() config.model_dir = model_path config.prepare() - model = ExLlamaV2(config) model.load() self._tokenizer = ExLlamaV2Tokenizer(config) - cache = ExLlamaV2Cache(model) - self.generator = ExLlamaV2BaseGenerator(model, cache, self._tokenizer) - self.generator.warmup() + self._generator = ExLlamaV2BaseGenerator(model, cache, self._tokenizer) + self._generator.warmup() + self._settings = ExLlamaV2Sampler.Settings() + self._settings.token_repetition_penalty = 1.0 else: self._tokenizer = LlamaTokenizer.from_pretrained(model_path, use_fast=False) self._model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") @@ -70,31 +72,21 @@ def format_message(self, messages: List[Message], functions: Optional[List[Funct return prompt def generate(self, input_text: str, params: Dict[str, Any]) -> str: - # Tokenize the input - input_ids = self._tokenizer(input_text, return_tensors="pt").input_ids.cuda() - # Run the model to infer an output temperature, top_p = params.get("temperature"), params.get("top_p") - do_sample = True if temperature > 0.0 else False - output_ids = self._model.generate( - input_ids=input_ids, - max_new_tokens=512, - do_sample=do_sample, - top_p=top_p, - temperature=temperature, - ) - raw_output = self._tokenizer.decode(output_ids[0], skip_special_tokens=True) - output = raw_output[len(input_text) :] - choices = self._postprocess(text=output) - return choices - - def generate_exllama(self, input_text: str, params: Dict[str, Any]) -> str: - temperature, top_p = params.get("temperature"), params.get("top_p") - settings = ExLlamaV2Sampler.Settings() - 
settings.token_repetition_penalty = 1.0 - settings.temperature = temperature - settings.top_p = top_p - raw_output = self.generator.generate_simple(input_text, settings, num_tokens=512) - breakpoint() + if self._model_type == ModelType.exllamav2: + self._settings.temperature, self._settings.top_p = temperature, top_p + raw_output = self._generator.generate_simple(input_text, self._settings, num_tokens=512) + else: + input_ids = self._tokenizer(input_text, return_tensors="pt").input_ids.cuda() + do_sample = True if temperature > 0.0 else False + output_ids = self._model.generate( + input_ids=input_ids, + max_new_tokens=512, + do_sample=do_sample, + top_p=top_p, + temperature=temperature, + ) + raw_output = self._tokenizer.decode(output_ids[0], skip_special_tokens=True) output = raw_output[len(input_text) :] choices = self._postprocess(text=output) return choices @@ -127,9 +119,9 @@ def _postprocess(self, text): return choices @classmethod - async def maybe_init(cls, model_path: str) -> InvokerPipeline: + async def maybe_init(cls, model_path: str, model_type: ModelType) -> InvokerPipeline: if cls._pipeline is None: - cls._pipeline = InvokerPipeline(model_path=model_path) + cls._pipeline = InvokerPipeline(model_path=model_path, model_type=model_type) if cls._pipeline is not None: return cls._pipeline else: diff --git a/invoker/utils/enum_tags.py b/invoker/utils/enum_tags.py new file mode 100644 index 0000000..e7f7535 --- /dev/null +++ b/invoker/utils/enum_tags.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class ModelType(str, Enum): + hf = "hf" + exllamav2 = "exllamav2" diff --git a/server_fastapi.py b/server_fastapi.py index f763afb..6068561 100644 --- a/server_fastapi.py +++ b/server_fastapi.py @@ -7,16 +7,18 @@ from invoker.api_types import ChatInput, ChatOutput from invoker.model import InvokerPipeline - - -async def get_pipeline(model_path: str): - return await InvokerPipeline.maybe_init(model_path=model_path) +from invoker.utils.enum_tags import ModelType class Settings(BaseSettings): + invoker_model_type: ModelType = Field("hf", env="INVOKER_MODEL_TYPE") invoker_model_name_or_path: str = Field("jeffrey-fong/invoker-13b", env="INVOKER_MODEL_NAME_OR_PATH") +async def get_pipeline(model_path: str, model_type: ModelType): + return await InvokerPipeline.maybe_init(model_path=model_path, model_type=model_type) + + app = FastAPI(title="Invoker") settings = Settings() @@ -24,18 +26,15 @@ class Settings(BaseSettings): @app.post("/chat/completions", response_model=ChatOutput) async def chat(req: ChatInput): id = str(uuid.uuid4()) - invoker_pipeline: InvokerPipeline = await get_pipeline(model_path=settings.invoker_model_name_or_path) - prompt = invoker_pipeline.format_message(messages=req.messages, functions=req.functions) - # choices = invoker_pipeline.generate( - # input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p} - # ) - choices = invoker_pipeline.generate_exllama( - input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p} + invoker_pipeline: InvokerPipeline = await get_pipeline( + model_path=settings.invoker_model_name_or_path, model_type=settings.invoker_model_type ) + prompt = invoker_pipeline.format_message(messages=req.messages, functions=req.functions) + choices = invoker_pipeline.generate(input_text=prompt, params={"temperature": req.temperature, "top_p": req.top_p}) created = int(time.time()) return {"id": id, "created": created, "choices": choices} @app.on_event("startup") async def startup(): - _ = await 
get_pipeline(model_path=settings.invoker_model_name_or_path) + _ = await get_pipeline(model_path=settings.invoker_model_name_or_path, model_type=settings.invoker_model_type) From fd5546e15406e14d6a912779a4752cd37a050a6e Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Wed, 27 Sep 2023 15:25:42 +0000 Subject: [PATCH 3/3] update README --- README.md | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e4cfb98..258fcdf 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,11 @@ Invoker is a suite of large language models based on Llama-2 and is finetuned to plan between calling functions and providing responses directly. Currently, we have released the 13B version and there are plans for the 7B and 34B versions to be trained and released in the future. +## News + +- [2023/09] We released **Invoker-13B-GPTQ**, which is a 4-bit quantized GPTQ implementation of Invoker-13B. Download [weights](https://huggingface.co/jeffrey-fong/invoker-13b-GPTQ). We also added ExllamaV2 integration! +- [2023/09] We released **Invoker-13B**, a model trained on function-calling and multi-turn conversation datasets. Download [weights](https://huggingface.co/jeffrey-fong/invoker-13b) + ## Installation & Usage The usage of Invoker follows exactly like OpenAI's function calling. Simply install the required dependencies: @@ -20,20 +25,29 @@ pip install -r requirements.txt #### Launching the Server -Kick-start the FastAPI server. You can indicate the model via environment variables. The list of models are indicated [here](#download). +Kick-start the FastAPI server. You can indicate the model details via environment variables. The Invoker server currently supports 2 different ways to load the model. If you would like to load the full fp16 model using HuggingFace transformers, run the following commands: ```shell +EXPORT INVOKER_MODEL_TYPE=hf EXPORT INVOKER_MODEL_NAME_OR_PATH=jeffrey-fong/invoker-13b uvicorn server_fastapi:app ``` -There are plans to set up accelerated servers based on [ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2) that can work with GPTQ-based models. Stay tuned for more updates! +If you would like to load 4-bit quantized Invoker GPTQ models, using [ExLlamaV2](https://github.com/turboderp/exllamav2), run the following commands: + +```shell +EXPORT INVOKER_MODEL_TYPE=exllamav2 +EXPORT INVOKER_MODEL_NAME_OR_PATH=jeffrey-fong/invoker-13b-GPTQ +uvicorn server_fastapi:app +``` + +The full list of models are indicated [here](#download). #### Inference Inference can then be performed exactly like OpenAI function-calling. Provide the chat and the functions in the `messages` and `functions` arguments respectively. Invoker also supports the following generation hyperparameters: -- `temperature: float = 0.7` Accepts values between 0.0 and 1.0. Defaults to 0.7 if the temperature is not passed in. +- `temperature: float = 0.5` Accepts values between 0.0 and 1.0. Defaults to 0.5 if the temperature is not passed in. - `top_p: float = 1.0` Accepts values between 0.0 and 1.0. Defaults to 1.0 if the top_p is not passed in. 
```python @@ -108,7 +122,7 @@ Please refer to the model card in HuggingFace to see how to use the model direct | Model | Link | Version | | ------------- | ------------- |------------- | | Invoker-13B | [Huggingface Repo](https://huggingface.co/jeffrey-fong/invoker-13b) |v1.0| -| Invoker-13B-GPTQ | Coming Soon |v1.0| +| Invoker-13B-GPTQ | [Huggingface Repo](https://huggingface.co/jeffrey-fong/invoker-13b-GPTQ) |v1.0| | Invoker-7B | Coming Soon |v1.0| | Invoker-34B | Coming Soon |v1.0| @@ -153,9 +167,9 @@ All the datasets used are under Apache-2.0 License. Therefore, this dataset will ## To-Dos +- [X] Quantize 13B model +- [X] Work on GPTQ-based servers ([ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2)) - [ ] Work on validating function names, descriptions, etc. Just like OpenAI's function calling -- [ ] Quantize 13B model -- [ ] Work on GPTQ-based servers ([ExLlama](https://github.com/turboderp/exllama) and/or [ExLlamaV2](https://github.com/turboderp/exllamav2)) - [ ] Converting Invoker to other formats like: - [ ] GGUF - [ ] AWQ
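
For reviewers, the ExLlamaV2 code path introduced in the first two patches condenses to roughly the standalone flow below. This is a sketch assembled only from the calls that appear in the diff (exllamav2==0.0.3); the model directory and prompt are placeholders.

```python
from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler

# Placeholder: local directory holding the GPTQ weights,
# e.g. a download of jeffrey-fong/invoker-13b-GPTQ.
model_dir = "/path/to/invoker-13b-GPTQ"

# Load config, weights, tokenizer and cache (mirrors InvokerPipeline.__init__).
config = ExLlamaV2Config()
config.model_dir = model_dir
config.prepare()
model = ExLlamaV2(config)
model.load()
tokenizer = ExLlamaV2Tokenizer(config)
cache = ExLlamaV2Cache(model)
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
generator.warmup()

# Sampler settings reused per request (mirrors InvokerPipeline.generate).
settings = ExLlamaV2Sampler.Settings()
settings.token_repetition_penalty = 1.0
settings.temperature = 0.5
settings.top_p = 1.0

# Placeholder prompt; in the server this comes from InvokerPipeline.format_message.
prompt = "Available Functions: ..."
raw_output = generator.generate_simple(prompt, settings, num_tokens=512)
print(raw_output[len(prompt):])  # strip the echoed prompt, as generate() does
```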
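
Since inference is meant to work exactly like OpenAI function calling, a minimal client-side smoke test against the new server can look like the sketch below. It assumes the server is running locally on uvicorn's default port 8000 and uses the openai==0.28.0 client pinned in requirements.txt; the `get_current_weather` schema and the API key value are purely illustrative.

```python
import openai

# Point the 0.28-style OpenAI client at the local Invoker server (assumed port 8000).
openai.api_base = "http://localhost:8000"
openai.api_key = "placeholder"  # the server performs no authentication

# Illustrative function schema; replace with your own functions.
functions = [
    {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name, e.g. Singapore"}
            },
            "required": ["location"],
        },
    }
]

response = openai.ChatCompletion.create(
    model="jeffrey-fong/invoker-13b-GPTQ",
    messages=[{"role": "user", "content": "What's the weather like in Singapore?"}],
    functions=functions,
    temperature=0.5,
    top_p=1.0,
)
print(response["choices"][0])
```

The response mirrors the server's `{"id", "created", "choices"}` payload, so the first choice carries either a direct assistant reply or a `function_call`, depending on what `_postprocess` extracted.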