From a3681ee859a2e7d2d63e21628a4369121a780f49 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 15 Jul 2024 15:53:41 -0400 Subject: [PATCH] Update _drop_duplicates to ensure a redundant index column is not added Resetting the index before converting the dataframe to a dataset resolves the issue. Resolves: duplicate columns like `__index_level_0__`; this column gets added at each step and causes redundancy and confusion due to the same key name. Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index f9ca8725..05044dc0 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -57,8 +57,9 @@ def _drop_duplicates(self, dataset, cols): Drop duplicates from the dataset based on the columns provided. """ df = dataset.to_pandas() - df.drop_duplicates(subset=cols, inplace=True) - return Dataset.from_pandas(df) + df = df.drop_duplicates(subset=cols).reset_index(drop=True) + ds = Dataset.from_pandas(df) + return ds def generate(self, dataset) -> Dataset: """