Merge pull request #7 from Trainy-ai/patchfix_fastchat_transformers
bugfix: bump fastchat and transformers to latest
asaiacai authored Oct 26, 2023
2 parents 2eff234 + 3417763 commit 37a765a
Showing 10 changed files with 1,907 additions and 71 deletions.
11 changes: 5 additions & 6 deletions README.md
@@ -16,15 +16,14 @@ Follow the instructions here [to install Skypilot and provide cloud credentials]

 ```bash
 # create a fresh environment
 conda create -n "sky" python=3.10
 conda activate sky

+# For Macs, macOS >= 10.15 is required to install SkyPilot. For Apple Silicon-based devices (e.g. Apple M1)
+pip uninstall grpcio; conda install -c conda-forge grpcio=1.43.0 --force-reinstall
+
 # install the skypilot cli and dependency, for the clouds you want, e.g. GCP
 pip install "skypilot[gcp] @ git+https://github.com/skypilot-org/skypilot.git" # for aws, skypilot[aws]
 # install llm-atc
 pip install llm-atc

-# For Macs, macOS >= 10.15 has a conflict with grpcio
-pip uninstall grpcio; conda install -c conda-forge grpcio=1.43.0 --force-reinstall
-
 # Configure your cloud credentials. This is a GCP example. See https://skypilot.readthedocs.io/en/latest/getting-started/installation.html for examples with other cloud providers.
 pip install google-api-python-client

4 changes: 2 additions & 2 deletions docs/source/quickstart/finetuning.rst
@@ -14,12 +14,12 @@ To do a vicuna finetune of your first model through LLM-ATC, run the following

 .. code-block:: console

    # start training
-   $ llm-atc train --model_type vicuna --finetune_data ./vicuna_test.json --name myvicuna --checkpoint_bucket my-trainy-bucket --checkpoint_path ~/test_vicuna --checkpoint_store S3 --description "This is a finetuned model that just says its name is vicuna" -c mycluster --cloud gcp --envs "MODEL_SIZE=7 WANDB_API_KEY=<my wandb key>" --accelerator A100-80G:4
+   $ llm-atc train --model_type vicuna --finetune_data ./vicuna_test.json --name myvicuna --checkpoint_bucket my-trainy-bucket --checkpoint_store S3 --description "This is a finetuned model that just says its name is vicuna" -c mycluster --cloud gcp --envs "MODEL_BASE='meta-llama/Llama-2-7b-hf' HF_TOKEN=<huggingface_token> WANDB_API_KEY=<wandb_key>" --accelerator A100:8 --region asia-southeast1

    # Once training is done, shutdown the cluster
    $ sky down

-In this example, :code:`llm-atc train` requests a single :code:`A100-80G:4` instance from GCP.
+In this example, :code:`llm-atc train` requests a single :code:`A100:8` instance from GCP.
 If there is availability, a GPU instance is allocated and your finetuning data is
 uploaded from your laptop to the instance as training data for finetuning. URIs to
 object stores will also work. For example,
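For instance, a hypothetical invocation pointing :code:`--finetune_data` at an S3 URI (the bucket path is illustrative, not taken from this diff):

.. code-block:: console

   $ llm-atc train --model_type vicuna --finetune_data s3://my-trainy-bucket/vicuna_test.json --name myvicuna --checkpoint_bucket my-trainy-bucket --checkpoint_store S3 -c mycluster --cloud gcp --envs "MODEL_BASE='meta-llama/Llama-2-7b-hf' HF_TOKEN=<huggingface_token>" --accelerator A100:8
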
9 changes: 3 additions & 6 deletions docs/source/quickstart/installation.rst
@@ -9,15 +9,12 @@ We recommend installing LLM-ATC and the `Skypilot dependency <https://skypilot.r

    $ conda create -n "sky" python=3.10
    $ conda activate sky

+   # For Macs, macOS >= 10.15 is required to install SkyPilot. For Apple Silicon-based devices (e.g. Apple M1)
+   $ pip uninstall grpcio; conda install -c conda-forge grpcio=1.43.0 --force-reinstall
+
    # install the skypilot cli and dependency, for the clouds you want, e.g. AWS, GCP
    $ pip install "skypilot[aws,gcp] @ git+https://github.com/skypilot-org/skypilot.git"
    # install llm-atc
    $ pip install llm-atc

-   # For Macs, macOS >= 10.15 has a conflict with grpcio
-   $ pip uninstall grpcio; conda install -c conda-forge grpcio=1.43.0 --force-reinstall

 Installation from Source
 ------------------------

16 changes: 8 additions & 8 deletions llm_atc/cli.py
@@ -61,12 +61,6 @@ def cli():
 @click.option(
     "--checkpoint_bucket", type=str, required=True, help="object store bucket name"
 )
-@click.option(
-    "--checkpoint_path",
-    type=str,
-    required=True,
-    help="object store path for fine tuned checkpoints, e.g. ~/datasets",
-)
 @click.option(
     "--checkpoint_store",
     type=str,
@@ -94,6 +88,10 @@ def cli():
     type=str,
     help="Environment variables for run. Usage `llm-atc train ... --envs 'MODEL_SIZE=7 USE_FLASH_ATTN=0 WANDB_API_KEY=<mywandb_key>'`",
 )
+@click.option(
+    "--region", type=str, help="which region to train in. Defaults to any region"
+)
+@click.option("--zone", type=str, help="which zone to train in. Defaults to any zone")
 @click.option("--accelerator", type=str, help="Which GPU type to use", required=True)
 @click.option(
     "--detach_setup",
@@ -116,13 +114,14 @@ def train(
     model_type: str,
     finetune_data: str,
     checkpoint_bucket: str,
-    checkpoint_path: str,
     checkpoint_store: Optional[str],
     name: str,
     description: str,
     cluster: Optional[str],
     cloud: Optional[str],
     envs: Optional[str],
+    region: Optional[str],
+    zone: Optional[str],
     accelerator: Optional[str],
     detach_setup: Optional[bool],
     detach_run: Optional[bool],
@@ -139,13 +138,14 @@
     task = train_task(
         model_type,
         checkpoint_bucket=checkpoint_bucket,
-        checkpoint_path=checkpoint_path,
         checkpoint_store=checkpoint_store,
         finetune_data=finetune_data,
         name=name,
         cloud=cloud,
         accelerator=accelerator,
         envs=envs,
+        region=region,
+        zone=zone,
     )
     RunTracker.add_run(model_type, name, description, task)
     sky.launch(
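The new `--region` and `--zone` options thread straight through to the launch task; a hypothetical invocation pinning both (values illustrative):

```console
$ llm-atc train --model_type vicuna --finetune_data ./vicuna_test.json \
    --name myvicuna --checkpoint_bucket my-trainy-bucket --checkpoint_store S3 \
    -c mycluster --cloud gcp --accelerator A100:8 \
    --envs "MODEL_BASE='meta-llama/Llama-2-7b-hf' HF_TOKEN=<huggingface_token>" \
    --region asia-southeast1 --zone asia-southeast1-a
```

Omitting both flags keeps the default behavior described in the help text: any region, any zone.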
41 changes: 18 additions & 23 deletions llm_atc/config/train/vicuna.yml
@@ -20,9 +20,9 @@ setup: |
   fi
-  # Install huggingface with the LLaMA commit
   pip install protobuf
-  git clone https://github.com/huggingface/transformers.git
-  cd transformers
-  git checkout 41a2f3529c6b56866c317031375ffd3e7b8bea01
-  pip install .
-  cd -
@@ -31,7 +31,7 @@ setup: |
   pip install sentencepiece

   # Install pytorch
-  pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
+  pip install torch==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu116
   git clone https://github.com/lm-sys/FastChat.git
   cd FastChat
   pip install -e .
@@ -44,6 +44,12 @@ setup: |
   sudo apt update
   sudo apt install -y rclone
+  if [[ "$HF_TOKEN" != "" ]];
+  then
+    pip install --upgrade huggingface_hub
+    huggingface-cli login --token $HF_TOKEN
+  fi

 run: |
   cd FastChat
   conda activate chatbot
@@ -58,39 +64,30 @@ run: |
   PER_DEVICE_BATCH_SIZE=$((2048 * $GC_SCALE / $SEQ_LEN))
   NUM_NODES=`echo "$SKYPILOT_NODE_IPS" | wc -l`
   HOST_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`
-  # Do the periodic syncing manually, to avoid the degradation of
-  # the training for saving checkpoints.
-  mkdir -p ~/.checkpoints
-  LOCAL_CKPT_PATH=~/.checkpoints
-  CKPT_PATH=/artifacts/${BUCKET_PATH}/${MODEL_NAME}
-  mkdir -p $CKPT_PATH
-  last_ckpt=$(ls ${CKPT_PATH} | grep -E '[0-9]+' | sort -t'-' -k1,1 -k2,2n | tail -1)
-  mkdir -p ~/.checkpoints/${last_ckpt}
-  rclone copyto --progress ${CKPT_PATH}/${last_ckpt}/ ~/.checkpoints/${last_ckpt}
   # Turn off wandb if no api key is provided
   if [ "$WANDB_API_KEY" == "" ]; then
     export WANDB_MODE="offline"
   fi
-  if [[ "$HF_TOKEN" != "" ]];
-  then
-    pip install --upgrade huggingface_hub
-    huggingface-cli login --token $HF_TOKEN
+  # use LlamaDecoderLayer if using Llama model
+  if [[ $MODEL_BASE == *"llama"* ]]; then
+    LLAMA_LAYER="--fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer"
+  else
+    LLAMA_LAYER=""
   fi
   torchrun \
     --nnodes=$NUM_NODES \
     --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
     --master_port=12375 \
     --master_addr=$HOST_ADDR \
     --node_rank=${SKYPILOT_NODE_RANK} \
     $TRAIN_SCRIPT \
-    --model_name_or_path huggyllama/llama-${MODEL_SIZE}b \
+    --model_name_or_path ${MODEL_BASE} \
     --data_path /data/mydata.json \
     --bf16 True \
-    --output_dir $LOCAL_CKPT_PATH \
+    --output_dir /artifacts/${MODEL_NAME} \
     --num_train_epochs 3 \
     --per_device_train_batch_size $PER_DEVICE_BATCH_SIZE \
     --per_device_eval_batch_size $PER_DEVICE_BATCH_SIZE \
@@ -105,7 +102,7 @@ run: |
     --lr_scheduler_type "cosine" \
     --logging_steps 1 \
     --fsdp "full_shard auto_wrap" \
-    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
+    ${LLAMA_LAYER} \
     --tf32 True \
     --model_max_length ${SEQ_LEN} \
     --run_name $SKYPILOT_JOB_ID \
@@ -114,18 +111,16 @@ run: |
   returncode=$?
-  # Sync any files not in the checkpoint-* folders
-  rclone sync --progress $LOCAL_CKPT_PATH/ $CKPT_PATH/
   exit $returncode

 envs:
-  MODEL_SIZE: 7
+  MODEL_BASE: meta-llama/Llama-2-7b-hf
   SEQ_LEN: 2048
   GC_SCALE: 4
   USE_FLASH_ATTN: 0
   WANDB_API_KEY: ""
   MODEL_NAME: "vicuna_test"
+  HF_TOKEN: ""
   MY_BUCKET: "llm-atc"
-  BUCKET_PATH: "my_vicuna" # object store path.
   BUCKET_TYPE: "S3"
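As a quick sanity check of the batch-size arithmetic in the `run` block, a sketch using the defaults from `envs`:

```bash
# Defaults from the envs block above
SEQ_LEN=2048
GC_SCALE=4
# Per-device batch scales inversely with sequence length:
# 2048 * 4 / 2048 = 4 sequences per GPU per step
PER_DEVICE_BATCH_SIZE=$((2048 * GC_SCALE / SEQ_LEN))
echo "$PER_DEVICE_BATCH_SIZE"   # prints 4
```

Doubling `GC_SCALE` or halving `SEQ_LEN` doubles the per-device batch.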
15 changes: 6 additions & 9 deletions llm_atc/launch.py
@@ -41,18 +41,16 @@ def __init__(
         self,
         finetune_data: str,
         checkpoint_bucket: str = "llm-atc",
-        checkpoint_path: str = "my_vicuna",
         checkpoint_store: str = "S3",
         name: Optional[str] = None,
         cloud: Optional[str] = None,
+        region: Optional[str] = None,
+        zone: Optional[str] = None,
         accelerator: Optional[str] = None,
         envs: Optional[str] = "",
-        region: Optional[str] = None,
-        zone: Optional[str] = None,
     ):
         self.finetune_data: str = finetune_data
         self.checkpoint_bucket: str = checkpoint_bucket
-        self.checkpoint_path: str = checkpoint_path
         self.checkpoint_store: str = checkpoint_store
         self.name: Optional[str] = name
         self.cloud: Optional[str] = cloud
@@ -82,18 +80,17 @@ def launch(self) -> sky.Task:
         task = self.default_task
         task.name = self.name
         self.envs["MODEL_NAME"] = self.name
-        if "MODEL_SIZE" not in self.envs:
+        if "MODEL_BASE" not in self.envs:
             logging.warning(
-                f"envs.MODEL_SIZE not set, defaulting to {task.envs['MODEL_SIZE']}"
+                f"envs.MODEL_BASE not set, defaulting to {task.envs['MODEL_BASE']}"
             )
         if "WANDB_API_KEY" not in self.envs:
             logging.warning(f"envs.WANDB_API_KEY not set, skipping WandB logging")
         if "HF_TOKEN" not in self.envs:
-            logging.warning(
-                "No huggingface token provided. You will not be able to finetune starting from private or gated models"
+            raise ValueError(
+                "No huggingface access token provided. You will not be able to finetune starting from Llama2"
             )
         self.envs["MY_BUCKET"] = self.checkpoint_bucket
-        self.envs["BUCKET_PATH"] = self.checkpoint_path
         self.envs["BUCKET_TYPE"] = self.checkpoint_store
         task.update_envs(self.envs)
         task.update_file_mounts({"/data/mydata.json": self.finetune_data})
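Since `launch()` now raises instead of warning when `HF_TOKEN` is absent, callers must pass a token through `--envs`; a hypothetical invocation (placeholder token):

```console
# Omitting HF_TOKEN from --envs now fails fast with the ValueError above
$ llm-atc train --model_type vicuna --finetune_data ./vicuna_test.json \
    --name myvicuna --checkpoint_bucket my-trainy-bucket -c mycluster \
    --cloud gcp --accelerator A100:8 \
    --envs "MODEL_BASE='meta-llama/Llama-2-7b-hf' HF_TOKEN=<huggingface_token>"
```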