From 6d72097077dda2045d36e7d0a30be5a71d10b0ae Mon Sep 17 00:00:00 2001
From: Yizhong Zhang
Date: Wed, 7 Feb 2024 10:11:55 +0800
Subject: [PATCH] [Inference] Fix auth token and add models starcoder and llama2 (#39)

* add starcoder and enable llama2
* nit
* nit
* revert
* add token
* dedup
* add token to from_pretrained
* pass auth token to from_pretrained
* nit
* add auth tokens
* lint
* fix lint
* nit
* deepspeed not support starcoder
* nit
* remove from ci
* remove direct auth token
* add back ci workflow temporarily
* remove from ci
* add load environment and enable 2 models again
* add dir
* add load environment and enable 2 models again
* change proxy
* revert proxy
* change proxy
* revert proxy
* remove 2 models from ci

---------

Signed-off-by: Yizhong Zhang
---
 .github/workflows/workflow_inference.yml | 29 +++++++++++++++++++++---
 inference/deepspeed_predictor.py         |  7 +++++-
 inference/models/llama-2-7b-chat-hf.yaml |  2 +-
 inference/models/starcoder.yaml          | 22 ++++++++++++++++++
 inference/predictor.py                   |  3 ++-
 inference/transformer_predictor.py       |  7 +++++-
 6 files changed, 63 insertions(+), 7 deletions(-)
 create mode 100644 inference/models/starcoder.yaml

diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml
index 03269a4a4..6a5617a66 100644
--- a/.github/workflows/workflow_inference.yml
+++ b/.github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
     name: inference test
     strategy:
       matrix:
-        model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, llama-2-7b-chat-hf-vllm ]
+        model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
@@ -61,11 +61,15 @@ jobs:
         https_proxy: ${{ inputs.https_proxy }}
       volumes:
         - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ inputs.runner_config_path }}:/root/actions-runner-config
 
     steps:
       - name: Checkout
        uses: actions/checkout@v2
 
+      - name: Load environment variables
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
+
       - name: Determine Target
         id: "target"
         run: |
@@ -111,6 +115,25 @@ jobs:
       - name: Run Inference Test
         run: |
           TARGET=${{steps.target.outputs.target}}
+          CMD=$(cat << EOF
+          import yaml
+          if ("${{ matrix.model }}" == "starcoder"):
+              conf_path = "inference/models/starcoder.yaml"
+              with open(conf_path, encoding="utf-8") as reader:
+                  result = yaml.load(reader, Loader=yaml.FullLoader)
+                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+              with open(conf_path, 'w') as output:
+                  yaml.dump(result, output, sort_keys=False)
+          if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
+              conf_path = "inference/models/llama-2-7b-chat-hf.yaml"
+              with open(conf_path, encoding="utf-8") as reader:
+                  result = yaml.load(reader, Loader=yaml.FullLoader)
+                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+              with open(conf_path, 'w') as output:
+                  yaml.dump(result, output, sort_keys=False)
+          EOF
+          )
+          docker exec "${TARGET}" python -c "$CMD"
           if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
             docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
           elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
@@ -134,7 +157,7 @@ jobs:
       - name: Run Inference Test with DeepSpeed
         run: |
           TARGET=${{steps.target.outputs.target}}
-          if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|mpt-7b.*)$ ]]; then
+          if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
             echo ${{ matrix.model }} is not supported!
           elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
             docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
@@ -147,7 +170,7 @@ jobs:
         if: ${{ matrix.dtuner_model }}
         run: |
           TARGET=${{steps.target.outputs.target}}
-          if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
+          if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
             echo ${{ matrix.model }} is not supported!
           else
             docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
diff --git a/inference/deepspeed_predictor.py b/inference/deepspeed_predictor.py
index 2d02b4f56..464c81506 100644
--- a/inference/deepspeed_predictor.py
+++ b/inference/deepspeed_predictor.py
@@ -35,6 +35,7 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
             model_desc.model_id_or_path,
             torchscript=True,
             trust_remote_code=model_config.trust_remote_code,
+            use_auth_token=infer_conf.model_description.config.use_auth_token,
         )
 
         # get correct torch type for loading HF model
@@ -50,7 +51,11 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
         if model_desc.peft_model_id_or_path:
             from peft import PeftModel
 
-            self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path)
+            self.model = PeftModel.from_pretrained(
+                self.model,
+                model_desc.peft_model_id_or_path,
+                use_auth_token=infer_conf.model_description.config.use_auth_token,
+            )
             if model_desc.peft_type == "deltatuner":
                 from deltatuner import DeltaTunerModel
 
diff --git a/inference/models/llama-2-7b-chat-hf.yaml b/inference/models/llama-2-7b-chat-hf.yaml
index b0dc029da..168981aaa 100644
--- a/inference/models/llama-2-7b-chat-hf.yaml
+++ b/inference/models/llama-2-7b-chat-hf.yaml
@@ -7,7 +7,7 @@ deepspeed: false
 workers_per_group: 2
 device: "cpu"
 ipex:
-  enabled: true
+  enabled: false
   precision: bf16
 model_description:
   model_id_or_path: meta-llama/Llama-2-7b-chat-hf
diff --git a/inference/models/starcoder.yaml b/inference/models/starcoder.yaml
new file mode 100644
index 000000000..5c23ba043
--- /dev/null
+++ b/inference/models/starcoder.yaml
@@ -0,0 +1,22 @@
+port: 8000
+name: starcoder
+route_prefix: /starcoder
+cpus_per_worker: 24
+gpus_per_worker: 0
+deepspeed: false
+workers_per_group: 2
+ipex:
+  enabled: false
+  precision: bf16
+device: "cpu"
+model_description:
+  model_id_or_path: bigcode/starcoder
+  tokenizer_name_or_path: bigcode/starcoder
+  chat_processor: ChatModelGptJ
+  prompt:
+    intro: ''
+    human_id: ''
+    bot_id: ''
+    stop_words: []
+  config:
+    use_auth_token: ''
diff --git a/inference/predictor.py b/inference/predictor.py
index c8c9487bc..a69a9407e 100644
--- a/inference/predictor.py
+++ b/inference/predictor.py
@@ -10,7 +10,8 @@ class Predictor:
     def __init__(self, infer_conf: InferenceConfig) -> None:
         self.infer_conf = infer_conf
         self.tokenizer = AutoTokenizer.from_pretrained(
-            infer_conf.model_description.tokenizer_name_or_path
+            infer_conf.model_description.tokenizer_name_or_path,
+            **infer_conf.model_description.config.dict(),
         )
         self.device = torch.device(infer_conf.device)
         # now deepspeed predictor don't have the model
diff --git a/inference/transformer_predictor.py b/inference/transformer_predictor.py
index deaf0a189..c1e83e432 100644
--- a/inference/transformer_predictor.py
+++ b/inference/transformer_predictor.py
@@ -15,6 +15,7 @@ def __init__(self, infer_conf: InferenceConfig):
             model_desc.model_id_or_path,
             torchscript=True,
             trust_remote_code=model_config.trust_remote_code,
+            use_auth_token=infer_conf.model_description.config.use_auth_token,
         )
 
         if self.device.type == "hpu":
@@ -52,7 +53,11 @@ def __init__(self, infer_conf: InferenceConfig):
         if model_desc.peft_model_id_or_path:
             from peft import PeftModel
 
-            model = PeftModel.from_pretrained(model, model_desc.peft_model_id_or_path)
+            model = PeftModel.from_pretrained(
+                model,
+                model_desc.peft_model_id_or_path,
+                use_auth_token=infer_conf.model_description.config.use_auth_token,
+            )
             if model_desc.peft_type == "deltatuner":
                 from deltatuner import DeltaTunerModel
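
Note (illustrative, not part of the patch): a minimal sketch of how the use_auth_token value that the patch threads through the model YAMLs would reach the Hugging Face from_pretrained calls. The file path and YAML keys mirror inference/models/starcoder.yaml above; pyyaml and transformers are assumed to be installed.

# sketch.py -- standalone illustration, assuming inference/models/starcoder.yaml exists
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer

with open("inference/models/starcoder.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

desc = conf["model_description"]
# An empty string in the YAML means anonymous access; pass None in that case.
auth_token = desc["config"]["use_auth_token"] or None

# Both tokenizer and model receive the token so gated repos
# (e.g. meta-llama/Llama-2-7b-chat-hf) can be downloaded in CI.
tokenizer = AutoTokenizer.from_pretrained(
    desc["tokenizer_name_or_path"], use_auth_token=auth_token
)
model = AutoModelForCausalLM.from_pretrained(
    desc["model_id_or_path"], use_auth_token=auth_token
)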