delphi-suite · jaidhyani · Apr 27, 2024 · Apr 25, 2024
diff --git a/scripts/inference.py b/scripts/inference.py
@@ -19,7 +19,6 @@ def main(
     batch_size: Int,
     dataset_name: str,
     username: str,
-    token: str,
     funct_test: bool = False,
 ):
     """
@@ -30,7 +29,6 @@ def main(
     - batch_size: The batch size for processing. 80 worked well in CPU.
     - dataset_name: The name of the dataset from which validation set will be loaded
     - username: Hugging Face API username
-    - token: Hugging Face API token
     """
     val_ds = load_validation_dataset(dataset_name)
 
@@ -63,7 +61,6 @@ def main(
         repo_id=repo_id,
         split="validation",
         private=False,
-        token=token,
     )
 
 
@@ -90,11 +87,6 @@ def main(
         type=str,
         help="Hugging Face API username",
     )
-    parser.add_argument(
-        "--token",
-        type=str,
-        help="Hugging Face API token",
-    )
     parser.add_argument(
         "--test-funct", action="store_true", help="Enable test function mode"
     )
@@ -109,6 +101,5 @@ def main(
         args.batch_size,
         args.dataset_name,
         args.username,
-        args.token,
         args.test_funct,
     )
diff --git a/scripts/run_training.py b/scripts/run_training.py
@@ -100,7 +100,7 @@ def main():
     # run training
     results, run_context = run_training(config)
     final_out_dir = os.path.join(config.output_dir, "final")
-    save_results(config, results, run_context, final_out_dir)
+    save_results(config, results, run_context, final_out_dir, final=True)
     print(f"Saved results to {final_out_dir}")
 
 

diff --git a/scripts/tokenize_dataset.py b/scripts/tokenize_dataset.py
@@ -52,12 +52,6 @@
         required=True,
         help="Context size of the tokenized dataset as input of the model",
     )
-    parser.add_argument(
-        "--hf-token",
-        "-t",
-        type=str,
-        help="Hugging Face API token",
-    )
     parser.add_argument(
         "--batch-size",
         "-b",
@@ -86,7 +80,7 @@
     assert tokenizer.bos_token_id is not None, "Tokenizer must have a bos_token_id"
     assert tokenizer.eos_token_id is not None, "Tokenizer must have a eos_token_id"
 
-    api = HfApi(token=args.hf_token)
+    api = HfApi()
     api.create_repo(repo_id=args.out_repo_id, repo_type="dataset", exist_ok=True)
     tokenize_and_upload_split(
         dataset_split=in_dataset_split,

diff --git a/scripts/train_tokenizer.py b/scripts/train_tokenizer.py
@@ -64,12 +64,6 @@ def train_byte_level_bpe(
         required=True,
         help="Where to push the resulting tokenizer",
     )
-    parser.add_argument(
-        "--hf-token",
-        "-t",
-        type=str,
-        help="Hugging Face API token",
-    )
     args = parser.parse_args()
 
     print(f"Loading dataset '{args.in_repo_id}'...")
@@ -86,5 +80,4 @@ def train_byte_level_bpe(
     )
     tokenizer.push_to_hub(
         repo_id=args.out_repo_id,
-        token=args.hf_token,
     )
diff --git a/src/delphi/test_configs/debug.json b/src/delphi/test_configs/debug.json
@@ -16,5 +16,6 @@
     },
     "dataset": {
         "name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
-    }
+    },
+    "out_repo_id": ""
 }
diff --git a/src/delphi/test_configs/debug_transformers_bloom.json b/src/delphi/test_configs/debug_transformers_bloom.json
@@ -24,5 +24,6 @@
     "torch_seed": 1337,
     "dataset": {
         "name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
-    }
+    },
+    "out_repo_id": ""
 }
diff --git a/src/delphi/test_configs/v0-llama2-100k.json b/src/delphi/test_configs/v0-llama2-100k.json
@@ -26,5 +26,6 @@
   "torch_seed": 1337,
   "dataset": {
     "name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
-  }
+  },
+  "out_repo_id": ""
 }
diff --git a/src/delphi/train/config/huggingface_config.py b/src/delphi/train/config/huggingface_config.py
diff --git a/src/delphi/train/config/training_config.py b/src/delphi/train/config/training_config.py
@@ -9,7 +9,6 @@
 from .adam_config import AdamConfig
 from .dataset_config import DatasetConfig
 from .debug_config import DebugConfig
-from .huggingface_config import HuggingfaceConfig
 from .wandb_config import WandbConfig
 
 
@@ -83,6 +82,7 @@ class TrainingConfig:
         metadata={"help": "seed used for pseudorandomly sampling data during training"},
     )
     torch_seed: int = field(metadata={"help": "seed used for torch"})
+    save_optimizer: bool = True
 
     # data
     dataset: DatasetConfig = field(
@@ -91,7 +91,7 @@ class TrainingConfig:
 
     # third party
     wandb: Optional[WandbConfig] = None
-    hf: Optional[HuggingfaceConfig] = None
+    out_repo_id: str
 
     # debug
     debug_config: DebugConfig = field(default_factory=DebugConfig)
diff --git a/src/delphi/train/utils.py b/src/delphi/train/utils.py
@@ -239,6 +239,7 @@ def save_results(
     train_results: ModelTrainingState,
     run_context: RunContext,
     results_path: str,
+    final: bool = False,
 ):
     """
     save results to disk, and to huggingface if configured to do so.
@@ -247,21 +248,21 @@ def save_results(
     config, context (e.g. hardware), training step, etc
     """
     os.makedirs(results_path, exist_ok=True)
-    with open(os.path.join(results_path, "config.json"), "w") as file:
+    with open(os.path.join(results_path, "training_config.json"), "w") as file:
         json.dump(asdict(config), file, indent=2)
     model = train_results.model
     if isinstance(model, PreTrainedModel):
-        model = cast(PreTrainedModel, model)
         model.save_pretrained(
-            save_directory=os.path.join(results_path, "model"),
+            save_directory=results_path,
         )
     else:
         st.save_model(
-            train_results.model,
-            os.path.join(results_path, "model", "model.safetensors"),
+            model,
+            os.path.join(results_path, "model.safetensors"),
         )
-    with open(os.path.join(results_path, "opt.pt"), "wb") as f:
-        torch.save(train_results.optimizer.state_dict(), f)
+    if config.save_optimizer:
+        with open(os.path.join(results_path, "optimizer.pt"), "wb") as f:
+            torch.save(train_results.optimizer.state_dict(), f)
     with open(os.path.join(results_path, "training_state.json"), "w") as file:
         training_state_dict = {
             "iter_num": train_results.iter_num,
@@ -274,13 +275,22 @@ def save_results(
         run_context_dict = asdict(run_context)
         run_context_dict["device"] = str(run_context.device)
         json.dump(run_context_dict, file, indent=2)
-    if config.hf and config.hf.push_checkpoints_to_hub:
-        api = HfApi(token=config.hf.token)
+    if config.out_repo_id:
+        api = HfApi()
+        api.create_repo(config.out_repo_id, exist_ok=True)
+        branch_name = f"iter{train_results.iter_num}"
+        api.create_branch(config.out_repo_id, branch=branch_name)
         api.upload_folder(
             folder_path=results_path,
-            repo_id=str(config.hf.repo_id),
-            revision=f"iter_{train_results.iter_num}",
+            repo_id=config.out_repo_id,
+            revision=branch_name,
         )
+        if final:
+            api.upload_folder(
+                folder_path=results_path,
+                repo_id=config.out_repo_id,
+                revision="main",
+            )
 
 
 def count_tokens_so_far(config: TrainingConfig, mts: ModelTrainingState) -> int: