diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index d1cc1b910f29..906aa7dead31 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -3917,6 +3917,17 @@ jobs:
           --experiment-dir=/tmp/mixtral_pretrain_results \
           --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
 
+  L2_HF_Transformers_peft_test:
+    needs: [ cicd-test-container-setup ]
+    uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformers_peft_test') || needs.cicd-test-container-setup.outputs.all == 'true'
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python examples/llm/peft/hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
+      AFTER_SCRIPT: |
+        rm -rf nemo_experiments
+
   L2_NeMo_2_GPT_SFT_TP1PP1_MBS1:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
@@ -4469,6 +4480,7 @@ jobs:
       - L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
       - L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
      - L2_NeMo_2_Mixtral_Pretraining
+      - L2_HF_Transformers_peft_test
       - L2_PTQ_Llama2_FP8
       - L2_Community_LLM_Checkpoints_tests_Llama3
       - L2_Distill_Llama2
diff --git a/examples/llm/peft/hf.py b/examples/llm/peft/hf.py
index 5b24c22ab79d..00e90a52bb0a 100644
--- a/examples/llm/peft/hf.py
+++ b/examples/llm/peft/hf.py
@@ -41,14 +41,15 @@ def formatting_prompts_func(examples):
         ans = tokenizer(text)
         tokens = ans['input_ids']
         return {
-            'tokens': tokens,
+            'input_ids': tokens,
             'labels': tokens[1:] + [tokens[-1]],
         }
 
     from datasets import load_dataset
 
     dataset = load_dataset("rajpurkar/squad", split="train")
-    dataset = dataset.map(formatting_prompts_func, batched=False, batch_size=2)
+    columns_to_remove = list(filter(lambda x: x not in ['input_ids', 'labels'], dataset.features.keys()))
+    dataset = dataset.map(formatting_prompts_func, batched=False, batch_size=2, remove_columns=columns_to_remove)
 
     return dataset
 
diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
index 6922f38cfb26..d8d1917c4427 100644
--- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
+++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional
+
 import lightning.pytorch as pl
 import torch
 import torch.nn.functional as F
@@ -82,41 +84,31 @@ def configure_model(self):
             self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code)
         self.model.train()
 
-    def forward(self, input_ids, attention_mask=None, labels=None, loss_mask=None):
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: torch.Tensor = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
         outputs = self.model(
             input_ids=input_ids.to(self.model.device),
-            attention_mask=attention_mask,
+            attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else attention_mask,
         )
         labels = labels.to(self.model.device)
         if loss_mask is not None:
             loss_mask = loss_mask.to(self.model.device).view(-1)
         n_cls = outputs.logits.shape[-1]
-        outputs.loss = self.loss_fn(outputs.logits.view(-1, n_cls), labels.view(-1), loss_mask)
-        return outputs
+        return self.loss_fn(outputs.logits.view(-1, n_cls), labels.view(-1), loss_mask)
 
     def training_step(self, batch):
-        tokens = batch['tokens']
-        labels = batch['labels']
-        loss_mask = batch.get('loss_mask', None)
-        output = self.forward(
-            input_ids=tokens,
-            labels=labels,
-            loss_mask=loss_mask,
-        )
-
-        loss = output.loss
+        loss = self.forward(**batch)
         self.log('train_log', loss, on_step=True, on_epoch=True, prog_bar=True)
         return loss
 
     def validation_step(self, batch, batch_idx):
-        tokens = batch['tokens']
-        labels = batch['labels']
-        output = self.forward(
-            input_ids=tokens,
-            labels=labels,
-        )
-
-        loss = output.loss
+        loss = self.forward(**batch)
         self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
 
     def save_pretrained(self, path):
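For orientation, here is a minimal sketch of the contract the last two files now share: the dataset map emits only `input_ids` and `labels` (everything else is dropped via `remove_columns`), and `training_step`/`validation_step` pass the whole batch dict straight into `forward(**batch)`, so every remaining key has to match a parameter of the new signature or be absorbed by `**kwargs`. The toy tensors, shapes, and the `masked_cross_entropy` stand-in for `self.loss_fn` below are illustrative assumptions, not code from this diff.

```python
# Illustrative sketch only: shapes, vocab size, and masked_cross_entropy
# are assumptions standing in for the model's real loss_fn and outputs.
import torch
import torch.nn.functional as F


def masked_cross_entropy(logits, labels, loss_mask=None):
    # Per-token cross entropy; optionally zero out masked positions.
    loss = F.cross_entropy(logits, labels, reduction='none')
    if loss_mask is not None:
        loss = loss * loss_mask
    return loss.mean()


# A batch shaped like what the remapped dataset yields: after
# remove_columns, only 'input_ids' and 'labels' remain, so
# self.forward(**batch) lines up with the new forward signature.
batch = {
    'input_ids': torch.randint(0, 100, (2, 8)),
    'labels': torch.randint(0, 100, (2, 8)),
}

logits = torch.randn(2, 8, 100)  # stand-in for outputs.logits
n_cls = logits.shape[-1]
loss = masked_cross_entropy(logits.view(-1, n_cls), batch['labels'].view(-1))
print(loss.item())
```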