From a3681ee859a2e7d2d63e21628a4369121a780f49 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Mon, 15 Jul 2024 15:53:41 -0400
Subject: [PATCH] Update _drop_duplicates to ensure the redundancy of index
 doesn't happen

Resetting the index before converting the dataframe to a dataset resolves the issue.

Resolves: duplicate columns like `__index_level_0__`; this column gets added at each step and causes redundancy and confusion due to the same key name.

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/pipeline.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
index f9ca8725..05044dc0 100644
--- a/src/instructlab/sdg/pipeline.py
+++ b/src/instructlab/sdg/pipeline.py
@@ -57,8 +57,9 @@ def _drop_duplicates(self, dataset, cols):
         Drop duplicates from the dataset based on the columns provided.
         """
         df = dataset.to_pandas()
-        df.drop_duplicates(subset=cols, inplace=True)
-        return Dataset.from_pandas(df)
+        df = df.drop_duplicates(subset=cols).reset_index(drop=True)
+        ds = Dataset.from_pandas(df)
+        return ds
 
     def generate(self, dataset) -> Dataset:
         """