support more models in finetune (intel#66)

* support more models in finetune * modify dockerfile * fix bug caused by accelerate upgrade * add llama2 * fix error * fix error * test * fix error * update
zhangjian94cn · Sep 26, 2023 · a5b8376 · a5b8376
1 parent 005b27a
commit a5b8376
Show file tree

Hide file tree

Showing 10 changed files with 56 additions and 11 deletions.
diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml
@@ -8,11 +8,14 @@ jobs:
     name: finetune test
     strategy:
       matrix:
-        model: [ EleutherAI/gpt-j-6b, gpt2, bigscience/bloom-560m, facebook/opt-125m ]
+        model: [ EleutherAI/gpt-j-6b, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b, meta-llama/Llama-2-7b-chat-hf]
     runs-on: self-hosted
     steps:
       - name: Checkout
         uses: actions/checkout@v2
+
+      - name: Load environment variables
+        run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV
 
       - name: Build Docker Image
         run: docker build ./ --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile -t finetune:latest && yes | docker container prune && yes | docker image prune
@@ -31,6 +34,18 @@ jobs:
           with open(conf_path, encoding="utf-8") as reader:
               result = eval(reader.read())
               result['General']['base_model'] = "${{ matrix.model }}"
+              if "${{ matrix.model }}" == "mosaicml/mpt-7b-chat":
+                  result['General']['config']['trust_remote_code'] = True
+              else:
+                  result['General']['config']['trust_remote_code'] = False
+              if "${{ matrix.model }}" == "EleutherAI/gpt-j-6b" or "${{ matrix.model }}" == "gpt2":
+                  result['General']["gpt_base_model"] = True
+              else:
+                  result['General']["gpt_base_model"] = False
+              if "${{ matrix.model }}" == "meta-llama/Llama-2-7b-chat-hf":
+                  result['General']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+              else:
+                  result['General']["config"]["use_auth_token"] = None
               result['Training']['epochs'] = 1
               result['Training']['num_training_workers'] = 1
           with open(conf_path, 'w') as output:

diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py
@@ -101,6 +101,7 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator):
         else:
             lr_scheduler = None
 
+        model.train()
         self.model, self.optimizer, self.lr_scheduler = accelerator.prepare(
             model, optimizer, lr_scheduler
         )

diff --git a/dev/docker/Dockerfile b/dev/docker/Dockerfile
@@ -23,8 +23,8 @@ RUN conda init bash && \
     conda config --add channels intel && \
     conda install python==3.9
 
-RUN pip install --no-cache-dir accelerate==0.19.0 datasets==2.12.0 numpy==1.21.6 ray==2.5.0  \
-    raydp==1.6.0b20230527.dev0 transformers==4.26.0 typing==3.7.4.3  \
+RUN pip install --no-cache-dir accelerate==0.21.0 datasets==2.12.0 numpy==1.21.6 ray==2.5.0  \
+    raydp==1.6.0b20230527.dev0 transformers==4.31.0 typing==3.7.4.3  \
     tabulate ray[tune] ray[serve] gradio gymnasium dm-tree scikit-image pydantic==1.10.11  \
     tensorboard einops
 

diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md
@@ -6,9 +6,11 @@ The following are the parameters supported in the finetuning workflow.
 
 |Configuration Name| Default|Meaning|
 |-|-|-|
-|base_model| mosaicml/mpt-7b|Path to pretrained model or model identifier from huggingface.co/models|
+|base_model| EleutherAI/gpt-j-6b|Path to pretrained model or model identifier from huggingface.co/models|
+|gpt_base_model|True|This parameter works around [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). Set it to True when the pretrained model is GPT-related (e.g. gpt2, gpt-j-6b, pythia); otherwise set it to False.|
 |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
 |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
+|config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained` method|
 
 
 

diff --git a/examples/finetune/dolly1/dolly_1_finetune.conf b/examples/finetune/dolly1/dolly_1_finetune.conf
@@ -1,8 +1,13 @@
 {
     "General": {
         "base_model": "EleutherAI/gpt-j-6b",
+        "gpt_base_model": True,
         "output_dir": "/tmp/llm-ray/output",
-        "checkpoint_dir": "/tmp/llm-ray/checkpoint"
+        "checkpoint_dir": "/tmp/llm-ray/checkpoint",
+        "config": {
+            "trust_remote_code": False,
+            "use_auth_token": None,
+        }
     },
     "Dataset": {
         "train_file": "examples/finetune/dolly1/data/train/train.jsonl",

diff --git a/examples/finetune/dolly2/dolly_2_finetune.conf b/examples/finetune/dolly2/dolly_2_finetune.conf
@@ -1,8 +1,13 @@
 {
     "General": {
         "base_model": "EleutherAI/pythia-6.9b",
+        "gpt_base_model": True,
         "output_dir": "/tmp/llm-ray/output",
-        "checkpoint_dir": "/tmp/llm-ray/checkpoint"
+        "checkpoint_dir": "/tmp/llm-ray/checkpoint",
+        "config": {
+            "trust_remote_code": False,
+            "use_auth_token": None,
+        }
     },
     "Dataset": {
         "train_file": "databricks/databricks-dolly-15k",

diff --git a/examples/finetune/open_assistant/open_assistant_finetune.conf b/examples/finetune/open_assistant/open_assistant_finetune.conf
@@ -1,8 +1,13 @@
 {
     "General": {
         "base_model": "EleutherAI/gpt-j-6b",
+        "gpt_base_model": True,
         "output_dir": "/tmp/llm-ray/output",
-        "checkpoint_dir": "/tmp/llm-ray/checkpoint"
+        "checkpoint_dir": "/tmp/llm-ray/checkpoint",
+        "config": {
+            "trust_remote_code": False,
+            "use_auth_token": None,
+        }
     },
     "Dataset": {
         "train_file": "examples/finetune/open_assistant/data/train/train.jsonl",

diff --git a/finetune/finetune.conf b/finetune/finetune.conf
@@ -1,8 +1,15 @@
 {
     "General": {
         "base_model": "EleutherAI/gpt-j-6b",
+        # fix issue: https://github.com/huggingface/transformers/issues/22482
+        # transformers version 4.26.0 is required for gpt2, gpt-j-6B, pythia...
+        "gpt_base_model": True,
         "output_dir": "/tmp/llm-ray/output",
-        "checkpoint_dir": "/tmp/llm-ray/checkpoint"
+        "checkpoint_dir": "/tmp/llm-ray/checkpoint",
+        "config": {
+            "trust_remote_code": False,
+            "use_auth_token": None,
+        }
     },
     "Dataset": {
         "train_file": "examples/data/sample_finetune_data.jsonl",

diff --git a/finetune/finetune.py b/finetune/finetune.py
@@ -39,10 +39,12 @@ def train_func(config: Dict[str, Any]):
 
     tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(config = {
         "name": config["General"]["base_model"], 
+        "config": config["General"]["config"]
     })
 
     model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(config = {
-        "name": config["General"]["base_model"], 
+        "name": config["General"]["base_model"],
+        "config": config["General"]["config"]
     })
 
     optimizer = common.optimizer.Optimizer.registory.get("DefaultOptimizer")()(model, config = {
@@ -104,12 +106,15 @@ def main(external_config = None):
             "env_vars": {
                 "OMP_NUM_THREADS": str(resources_per_worker["CPU"]), 
                 "ACCELERATE_USE_CPU": "True", 
+                "ACCELERATE_USE_IPEX": "False",
                 "ACCELERATE_MIXED_PRECISION": "no",
                 "CCL_WORKER_COUNT": "1",
                 "CCL_LOG_LEVEL": "info",
                 "WORLD_SIZE": str(num_training_workers),
             }
         }
+        if config["General"]["gpt_base_model"] == True:
+            runtime_env["pip"] = ["transformers==4.26.0"]
         ray.init(runtime_env = runtime_env)
 
     scaling_config = ScalingConfig(

diff --git a/requirements.txt b/requirements.txt
@@ -1,11 +1,11 @@
-accelerate==0.19.0
+accelerate==0.21.0
 datasets==2.12.0
 numpy==1.24.4
 ray==2.5.0
 raydp==1.6.0b20230527.dev0
 torchvision==0.14.1
 torch==1.13.1
-transformers==4.26.0
+transformers==4.31.0
 typing==3.7.4.3
 tabulate
 ray[tune]