From 6d72097077dda2045d36e7d0a30be5a71d10b0ae Mon Sep 17 00:00:00 2001
From: Yizhong Zhang
Date: Wed, 7 Feb 2024 10:11:55 +0800
Subject: [PATCH] [Inference] Fix auth token and add models starcoder and llama2 (#39)

* add starcoder and enable llama2
* nit
* nit
* revert
* add token
* dedup
* add token to from_pretrained
* pass auth token to from_pretrained
* nit
* add auth tokens
* lint
* fix lint
* nit
* deepspeed not support starcoder
* nit
* remove from ci
* remove direct auth token
* add back ci workflow temporarily
* remove from ci
* add load environment and enable 2 models again
* add dir
* add load environment and enable 2 models again
* change proxy
* revert proxy
* change proxy
* revert proxy
* remove 2 models from ci

---------

Signed-off-by: Yizhong Zhang
---
 .github/workflows/workflow_inference.yml | 29 +++++++++++++++++++++---
 inference/deepspeed_predictor.py         |  7 +++++-
 inference/models/llama-2-7b-chat-hf.yaml |  2 +-
 inference/models/starcoder.yaml          | 22 ++++++++++++++++++
 inference/predictor.py                   |  3 ++-
 inference/transformer_predictor.py       |  7 +++++-
 6 files changed, 63 insertions(+), 7 deletions(-)
 create mode 100644 inference/models/starcoder.yaml

diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml
index 03269a4a4..6a5617a66 100644
--- a/.github/workflows/workflow_inference.yml
+++ b/.github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
     name: inference test
     strategy:
       matrix:
-        model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, llama-2-7b-chat-hf-vllm ]
+        model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
@@ -61,11 +61,15 @@ jobs:
         https_proxy: ${{ inputs.https_proxy }}
       volumes:
         - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ inputs.runner_config_path }}:/root/actions-runner-config
 
     steps:
       - name: Checkout
        uses: actions/checkout@v2
 
+      - name: Load environment variables
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
+
       - name: Determine Target
         id: "target"
         run: |
@@ -111,6 +115,25 @@ jobs:
       - name: Run Inference Test
         run: |
           TARGET=${{steps.target.outputs.target}}
+          CMD=$(cat << EOF
+          import yaml
+          if ("${{ matrix.model }}" == "starcoder"):
+              conf_path = "inference/models/starcoder.yaml"
+              with open(conf_path, encoding="utf-8") as reader:
+                  result = yaml.load(reader, Loader=yaml.FullLoader)
+                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+              with open(conf_path, 'w') as output:
+                  yaml.dump(result, output, sort_keys=False)
+          if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
+              conf_path = "inference/models/llama-2-7b-chat-hf.yaml"
+              with open(conf_path, encoding="utf-8") as reader:
+                  result = yaml.load(reader, Loader=yaml.FullLoader)
+                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+              with open(conf_path, 'w') as output:
+                  yaml.dump(result, output, sort_keys=False)
+          EOF
+          )
+          docker exec "${TARGET}" python -c "$CMD"
           if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
             docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
           elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
@@ -134,7 +157,7 @@ jobs:
       - name: Run Inference Test with DeepSpeed
         run: |
           TARGET=${{steps.target.outputs.target}}
-          if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|mpt-7b.*)$ ]]; then
+          if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
             echo ${{ matrix.model }} is not supported!
           elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
             docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
@@ -147,7 +170,7 @@ jobs:
         if: ${{ matrix.dtuner_model }}
         run: |
           TARGET=${{steps.target.outputs.target}}
-          if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
+          if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
             echo ${{ matrix.model }} is not supported!
           else
             docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
diff --git a/inference/deepspeed_predictor.py b/inference/deepspeed_predictor.py
index 2d02b4f56..464c81506 100644
--- a/inference/deepspeed_predictor.py
+++ b/inference/deepspeed_predictor.py
@@ -35,6 +35,7 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
             model_desc.model_id_or_path,
             torchscript=True,
             trust_remote_code=model_config.trust_remote_code,
+            use_auth_token=infer_conf.model_description.config.use_auth_token,
         )
 
         # get correct torch type for loading HF model
@@ -50,7 +51,11 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
         if model_desc.peft_model_id_or_path:
             from peft import PeftModel
 
-            self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path)
+            self.model = PeftModel.from_pretrained(
+                self.model,
+                model_desc.peft_model_id_or_path,
+                use_auth_token=infer_conf.model_description.config.use_auth_token,
+            )
             if model_desc.peft_type == "deltatuner":
                 from deltatuner import DeltaTunerModel
 
diff --git a/inference/models/llama-2-7b-chat-hf.yaml b/inference/models/llama-2-7b-chat-hf.yaml
index b0dc029da..168981aaa 100644
--- a/inference/models/llama-2-7b-chat-hf.yaml
+++ b/inference/models/llama-2-7b-chat-hf.yaml
@@ -7,7 +7,7 @@ deepspeed: false
 workers_per_group: 2
 device: "cpu"
 ipex:
-  enabled: true
+  enabled: false
   precision: bf16
 model_description:
   model_id_or_path: meta-llama/Llama-2-7b-chat-hf
diff --git a/inference/models/starcoder.yaml b/inference/models/starcoder.yaml
new file mode 100644
index 000000000..5c23ba043
--- /dev/null
+++ b/inference/models/starcoder.yaml
@@ -0,0 +1,22 @@
+port: 8000
+name: starcoder
+route_prefix: /starcoder
+cpus_per_worker: 24
+gpus_per_worker: 0
+deepspeed: false
+workers_per_group: 2
+ipex:
+  enabled: false
+  precision: bf16
+device: "cpu"
+model_description:
+  model_id_or_path: bigcode/starcoder
+  tokenizer_name_or_path: bigcode/starcoder
+  chat_processor: ChatModelGptJ
+  prompt:
+    intro: ''
+    human_id: ''
+    bot_id: ''
+    stop_words: []
+  config:
+    use_auth_token: ''
diff --git a/inference/predictor.py b/inference/predictor.py
index c8c9487bc..a69a9407e 100644
--- a/inference/predictor.py
+++ b/inference/predictor.py
@@ -10,7 +10,8 @@ class Predictor:
     def __init__(self, infer_conf: InferenceConfig) -> None:
         self.infer_conf = infer_conf
         self.tokenizer = AutoTokenizer.from_pretrained(
-            infer_conf.model_description.tokenizer_name_or_path
+            infer_conf.model_description.tokenizer_name_or_path,
+            **infer_conf.model_description.config.dict(),
         )
         self.device = torch.device(infer_conf.device)
         # now deepspeed predictor don't have the model
diff --git a/inference/transformer_predictor.py b/inference/transformer_predictor.py
index deaf0a189..c1e83e432 100644
--- a/inference/transformer_predictor.py
+++ b/inference/transformer_predictor.py
@@ -15,6 +15,7 @@ def __init__(self, infer_conf: InferenceConfig):
             model_desc.model_id_or_path,
             torchscript=True,
             trust_remote_code=model_config.trust_remote_code,
+            use_auth_token=infer_conf.model_description.config.use_auth_token,
         )
 
         if self.device.type == "hpu":
@@ -52,7 +53,11 @@ def __init__(self, infer_conf: InferenceConfig):
         if model_desc.peft_model_id_or_path:
             from peft import PeftModel
 
-            model = PeftModel.from_pretrained(model, model_desc.peft_model_id_or_path)
+            model = PeftModel.from_pretrained(
+                model,
+                model_desc.peft_model_id_or_path,
+                use_auth_token=infer_conf.model_description.config.use_auth_token,
+            )
             if model_desc.peft_type == "deltatuner":
                 from deltatuner import DeltaTunerModel
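
Note (illustrative, not part of the patch): a minimal sketch of how the use_auth_token value that the patch threads through the model YAMLs would reach the Hugging Face from_pretrained calls. The file path and YAML keys mirror inference/models/starcoder.yaml above; pyyaml and transformers are assumed to be installed.

# sketch.py -- standalone illustration, assuming inference/models/starcoder.yaml exists
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer

with open("inference/models/starcoder.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

desc = conf["model_description"]
# An empty string in the YAML means anonymous access; pass None in that case.
auth_token = desc["config"]["use_auth_token"] or None

# Both tokenizer and model receive the token so gated repos
# (e.g. meta-llama/Llama-2-7b-chat-hf) can be downloaded in CI.
tokenizer = AutoTokenizer.from_pretrained(
    desc["tokenizer_name_or_path"], use_auth_token=auth_token
)
model = AutoModelForCausalLM.from_pretrained(
    desc["model_id_or_path"], use_auth_token=auth_token
)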